test: rename previous test, cleanup, still does not work

fix: provide better context for the other test
test: actually duplicate L1 layer in test
2026-01-31 17:20:37 +00:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 10:31:56 +03:00
189 changed files with 14942 additions and 7020 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -1,7 +1,20 @@
 name: 'Create Allure report'
 description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

+inputs:
+  store-test-results-into-db:
+    description: 'Whether to store test results into the database. REGRESS_TEST_RESULT_CONNSTR/REGRESS_TEST_RESULT_CONNSTR_NEW should be set'
+    type: boolean
+    required: false
+    default: false
+
 outputs:
+  base-url:
+    description: 'Base URL for Allure report'
+    value: ${{ steps.generate-report.outputs.base-url }}
+  base-s3-url:
+    description: 'Base S3 URL for Allure report'
+    value: ${{ steps.generate-report.outputs.base-s3-url }}
  report-url:
    description: 'Allure report URL'
    value: ${{ steps.generate-report.outputs.report-url }}
@@ -63,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.22.1
-        ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
+        ALLURE_VERSION: 2.23.1
+        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
@@ -102,6 +115,11 @@ runs:
        REPORT_PREFIX=reports/${BRANCH_OR_PR}
        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}

+        BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
+        BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
+        REPORT_URL=${BASE_URL}/index.html
+        REPORT_JSON_URL=${BASE_URL}/data/suites.json
+
        # Get previously uploaded data for this run
        ZSTD_NBTHREADS=0

@@ -110,10 +128,9 @@ runs:
          # There's no previously uploaded data for this $GITHUB_RUN_ID
          exit 0
        fi
-        for S3_FILEPATH in ${S3_FILEPATHS}; do
-          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"

-          archive=${WORKDIR}/$(basename $S3_FILEPATH)
+        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
+        for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
          mkdir -p ${archive%.tar.zst}
          time tar -xf ${archive} -C ${archive%.tar.zst}
          rm -f ${archive}
@@ -130,9 +147,10 @@ runs:

        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

-        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
+        # and to keep files on the host to upload them to the database
+        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
@@ -144,8 +162,10 @@ runs:
        EOF
        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

-        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
+        echo "base-url=${BASE_URL}"               >> $GITHUB_OUTPUT
+        echo "base-s3-url=${BASE_S3_URL}"         >> $GITHUB_OUTPUT
+        echo "report-url=${REPORT_URL}"           >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT

        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}

@@ -159,6 +179,41 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

+    - name: Store Allure test stat in the DB
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result.py \
+          --revision ${COMMIT_SHA} \
+          --reference ${GITHUB_REF} \
+          --build-type unified \
+          --ingest ${WORKDIR}/report/data/suites.json
+
+    - name: Store Allure test stat in the DB (new)
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result-new-format.py \
+          --reference ${GITHUB_REF} \
+          --revision ${COMMIT_SHA} \
+          --run-id ${GITHUB_RUN_ID} \
+          --run-attempt ${GITHUB_RUN_ATTEMPT} \
+          --test-cases-dir ${WORKDIR}/report/data/test-cases
+
    - name: Cleanup
      if: always()
      shell: bash -euxo pipefail {0}
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -31,7 +31,7 @@ runs:
        BUCKET=neon-github-public-dev
        FILENAME=$(basename $ARCHIVE)

-        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
        if [ -z "${S3_KEY}" ]; then
          if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
            echo 'SKIPPED=true' >> $GITHUB_OUTPUT
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -432,6 +432,11 @@ jobs:
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+        env:
+          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - uses: actions/github-script@v6
        if: ${{ !cancelled() }}
@@ -452,25 +457,6 @@ jobs:
              report,
            })

-      - name: Store Allure test stat in the DB
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
-        run: |
-          ./scripts/pysync
-
-          curl --fail --output suites.json "${REPORT_JSON_URL}"
-          export BUILD_TYPE=unified
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-
-          poetry run python3 scripts/ingest_regress_test_result.py \
-            --revision ${COMMIT_SHA} \
-            --reference ${GITHUB_REF} \
-            --build-type ${BUILD_TYPE} \
-            --ingest suites.json
-
  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
@@ -751,34 +737,6 @@ jobs:
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --cleanup

-      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
-      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
-      # so we won't build extension twice, but extract them from compute-node.
-      #
-      # For now we use extensions image only for new custom extensitons
-      - name: Kaniko build extensions only
-        run: |
-          # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
-          # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
-          # it still fails with error:
-          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
-          #
-          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
-          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
-
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-                           --context . \
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
-                           --build-arg PG_VERSION=${{ matrix.version }} \
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
-                           --dockerfile Dockerfile.compute-node \
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --cleanup \
-                           --target postgres-extensions
-
      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
@@ -794,7 +752,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.13.1
+      VM_BUILDER_VERSION: v0.17.5

    steps:
      - name: Checkout
@@ -815,7 +773,11 @@ jobs:

      - name: Build vm image
        run: |
-          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder \
+            -enable-file-cache \
+            -cgroup-uid=postgres \
+            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
@@ -896,10 +858,8 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -910,10 +870,8 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -935,65 +893,56 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  upload-postgres-extensions-to-s3:
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-       github.event_name != 'workflow_dispatch'
-    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
-    needs: [ tag, promote-images ]
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15 ]
-
-    env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
-
+  build-private-extensions:
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
+    needs: [ tag ]
    steps:
-      - name: Pull postgres-extensions image
+      - name: Set PR's status to pending and request a remote CI test
        run: |
-          docker pull ${EXTENSIONS_IMAGE}
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"

-      - name: Create postgres-extensions container
-        id: create-container
-        run: |
-          EID=$(docker create ${EXTENSIONS_IMAGE} true)
-          echo "EID=${EID}" >> $GITHUB_OUTPUT
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"build-and-upload-extensions\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"

-      - name: Extract postgres-extensions from container
-        run: |
-          rm -rf ./extensions-to-upload # Just in case
-          mkdir -p extensions-to-upload
-
-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
-          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
-
-      - name: Upload postgres-extensions to S3
-        run: |
-          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
-            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
-          done
-
-      - name: Cleanup
-        if: ${{ always() && steps.create-container.outputs.EID }}
-        run: |
-          docker rm ${{ steps.create-container.outputs.EID }} || true
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"build-and-upload-extensions\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"remote_branch_name\": \"${{ github.ref_name }}\"
+              }
+            }"

  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
@@ -1067,7 +1016,7 @@ jobs:
            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
            if [ -z "${S3_KEY}" ]; then
              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
              exit 1
--- a/13
+++ b/13
@@ -1,11 +1,12 @@
-/compute_tools/ @neondatabase/control-plane
+/compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/ @neondatabase/compute @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute 
-/libs/remote_storage/ @neondatabase/storage 
-/libs/safekeeper_api/ @neondatabase/safekeepers  
-/pageserver/ @neondatabase/compute @neondatabase/storage 
+/libs/postgres_ffi/ @neondatabase/compute
+/libs/remote_storage/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/safekeepers
+/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
+/pageserver/ @neondatabase/compute @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/proxy/ @neondatabase/control-plane 
+/proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -190,7 +190,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -201,7 +201,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -553,12 +553,13 @@ dependencies = [

 [[package]]
 name = "axum"
-version = "0.6.18"
+version = "0.6.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39"
+checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
 dependencies = [
 "async-trait",
 "axum-core",
+ "base64 0.21.1",
 "bitflags",
 "bytes",
 "futures-util",
@@ -573,7 +574,13 @@ dependencies = [
 "pin-project-lite",
 "rustversion",
 "serde",
+ "serde_json",
+ "serde_path_to_error",
+ "serde_urlencoded",
+ "sha1",
 "sync_wrapper",
+ "tokio",
+ "tokio-tungstenite 0.20.0",
 "tower",
 "tower-layer",
 "tower-service",
@@ -639,6 +646,12 @@ dependencies = [
 "vsimd",
 ]

+[[package]]
+name = "base64ct"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -667,7 +680,7 @@ dependencies = [
 "regex",
 "rustc-hash",
 "shlex",
- "syn 2.0.16",
+ "syn 2.0.28",
 "which",
 ]

@@ -740,6 +753,9 @@ name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+dependencies = [
+ "jobserver",
+]

 [[package]]
 name = "cexpr"
@@ -756,6 +772,19 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]
+name = "cgroups-rs"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fb3af90c8d48ad5f432d8afb521b5b40c2a2fce46dd60e05912de51c47fba64"
+dependencies = [
+ "libc",
+ "log",
+ "nix 0.25.1",
+ "regex",
+ "thiserror",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.24"
@@ -840,7 +869,7 @@ dependencies = [
 "heck",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -883,6 +912,8 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
+ "regex",
+ "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -896,6 +927,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-compression",
+ "cfg-if",
 "chrono",
 "clap",
 "compute_api",
@@ -907,19 +939,24 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
+ "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
+ "tokio-util",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
 "url",
 "utils",
+ "vm_monitor",
 "workspace_hack",
+ "zstd",
 ]

 [[package]]
@@ -964,7 +1001,7 @@ dependencies = [
 "comfy-table",
 "compute_api",
 "git-version",
- "nix",
+ "nix 0.26.2",
 "once_cell",
 "pageserver_api",
 "postgres",
@@ -980,6 +1017,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -1003,9 +1041,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"

 [[package]]
 name = "cpufeatures"
-version = "0.2.7"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
+checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
 dependencies = [
 "libc",
 ]
@@ -1169,7 +1207,7 @@ dependencies = [
 "proc-macro2",
 "quote",
 "strsim",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1180,20 +1218,20 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
 "darling_core",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
 name = "dashmap"
-version = "5.4.0"
+version = "5.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
+checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
 "cfg-if",
- "hashbrown 0.12.3",
+ "hashbrown 0.14.0",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -1245,7 +1283,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1301,7 +1339,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1497,7 +1535,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1642,6 +1680,12 @@ dependencies = [
 "ahash",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+
 [[package]]
 name = "hashlink"
 version = "0.8.2"
@@ -1842,8 +1886,8 @@ dependencies = [
 "hyper",
 "pin-project",
 "tokio",
- "tokio-tungstenite",
- "tungstenite",
+ "tokio-tungstenite 0.18.0",
+ "tungstenite 0.18.0",
 ]

 [[package]]
@@ -1907,6 +1951,19 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "inotify"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
+dependencies = [
+ "bitflags",
+ "futures-core",
+ "inotify-sys",
+ "libc",
+ "tokio",
+]
+
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -1972,6 +2029,15 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

+[[package]]
+name = "jobserver"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.63"
@@ -2057,9 +2123,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"

 [[package]]
 name = "lock_api"
-version = "0.4.9"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
+checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
 dependencies = [
 "autocfg",
 "scopeguard",
@@ -2221,6 +2287,18 @@ dependencies = [
 "tempfile",
 ]

+[[package]]
+name = "nix"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4"
+dependencies = [
+ "autocfg",
+ "bitflags",
+ "cfg-if",
+ "libc",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.2"
@@ -2255,7 +2333,7 @@ dependencies = [
 "crossbeam-channel",
 "filetime",
 "fsevent-sys",
- "inotify",
+ "inotify 0.9.6",
 "kqueue",
 "libc",
 "mio",
@@ -2263,6 +2341,15 @@ dependencies = [
 "windows-sys 0.45.0",
 ]

+[[package]]
+name = "ntapi"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
+dependencies = [
+ "winapi",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2323,9 +2410,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.17.1"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"

 [[package]]
 name = "oorandom"
@@ -2356,7 +2443,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -2543,7 +2630,7 @@ dependencies = [
 "hyper",
 "itertools",
 "metrics",
- "nix",
+ "nix 0.26.2",
 "num-traits",
 "num_cpus",
 "once_cell",
@@ -2566,6 +2653,7 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
+ "smallvec",
 "storage_broker",
 "strum",
 "strum_macros",
@@ -2624,7 +2712,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -2643,15 +2731,26 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.7"
+version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
+checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "smallvec",
- "windows-sys 0.45.0",
+ "windows-targets 0.48.0",
+]
+
+[[package]]
+name = "password-hash"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
+dependencies = [
+ "base64ct",
+ "rand_core",
+ "subtle",
 ]

 [[package]]
@@ -2662,6 +2761,8 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
+ "password-hash",
+ "sha2",
 ]

 [[package]]
@@ -2730,7 +2831,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -2927,7 +3028,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
 "proc-macro2",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -2938,9 +3039,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.64"
+version = "1.0.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
 dependencies = [
 "unicode-ident",
 ]
@@ -3040,6 +3141,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "dashmap",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -3101,9 +3203,9 @@ dependencies = [

 [[package]]
 name = "quote"
-version = "1.0.27"
+version = "1.0.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
+checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
 dependencies = [
 "proc-macro2",
 ]
@@ -3237,6 +3339,7 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
+ "scopeguard",
 "serde",
 "serde_json",
 "tempfile",
@@ -3524,9 +3627,9 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.100.1"
+version = "0.100.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
+checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
 dependencies = [
 "ring",
 "untrusted",
@@ -3753,22 +3856,22 @@ dependencies = [

 [[package]]
 name = "serde"
-version = "1.0.163"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
+checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
 dependencies = [
 "serde_derive",
 ]

 [[package]]
 name = "serde_derive"
-version = "1.0.163"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
+checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -3782,6 +3885,16 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "serde_path_to_error"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4beec8bce849d58d06238cb50db2e1c417cfeafa4c63f692b15c82b7c80f8335"
+dependencies = [
+ "itoa",
+ "serde",
+]
+
 [[package]]
 name = "serde_spanned"
 version = "0.6.2"
@@ -3828,7 +3941,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -3927,9 +4040,9 @@ dependencies = [

 [[package]]
 name = "smallvec"
-version = "1.10.0"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
+checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"

 [[package]]
 name = "socket2"
@@ -4066,9 +4179,9 @@ dependencies = [

 [[package]]
 name = "syn"
-version = "2.0.16"
+version = "2.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
+checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -4094,14 +4207,29 @@ dependencies = [
 ]

 [[package]]
-name = "tar"
-version = "0.4.38"
+name = "sysinfo"
+version = "0.29.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
+checksum = "165d6d8539689e3d3bc8b98ac59541e1f21c7de7c85d60dc80e43ae0ed2113db"
+dependencies = [
+ "cfg-if",
+ "core-foundation-sys",
+ "libc",
+ "ntapi",
+ "once_cell",
+ "rayon",
+ "winapi",
+]
+
+[[package]]
+name = "tar"
+version = "0.4.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
 dependencies = [
 "filetime",
 "libc",
- "xattr 0.2.3",
+ "xattr",
 ]

 [[package]]
@@ -4183,7 +4311,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -4298,7 +4426,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -4392,7 +4520,7 @@ dependencies = [
 "redox_syscall 0.3.5",
 "tokio",
 "tokio-stream",
- "xattr 1.0.0",
+ "xattr",
 ]

 [[package]]
@@ -4404,7 +4532,19 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite",
+ "tungstenite 0.18.0",
+]
+
+[[package]]
+name = "tokio-tungstenite"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2dbec703c26b00d74844519606ef15d09a7d6857860f84ad223dec002ddea2"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite 0.20.0",
 ]

 [[package]]
@@ -4596,7 +4736,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -4725,6 +4865,25 @@ dependencies = [
 "utf-8",
 ]

+[[package]]
+name = "tungstenite"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "data-encoding",
+ "http",
+ "httparse",
+ "log",
+ "rand",
+ "sha1",
+ "thiserror",
+ "url",
+ "utf-8",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -4852,7 +5011,7 @@ dependencies = [
 "hyper",
 "jsonwebtoken",
 "metrics",
- "nix",
+ "nix 0.26.2",
 "once_cell",
 "pin-project-lite",
 "pq_proto",
@@ -4870,6 +5029,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
+ "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -4906,6 +5066,28 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

+[[package]]
+name = "vm_monitor"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "axum",
+ "cgroups-rs",
+ "clap",
+ "futures",
+ "inotify 0.10.2",
+ "serde",
+ "serde_json",
+ "sysinfo",
+ "tokio",
+ "tokio-postgres",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
+ "workspace_hack",
+]
+
 [[package]]
 name = "vsimd"
 version = "0.8.0"
@@ -4976,7 +5158,7 @@ dependencies = [
 "once_cell",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 "wasm-bindgen-shared",
 ]

@@ -5010,7 +5192,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 "wasm-bindgen-backend",
 "wasm-bindgen-shared",
 ]
@@ -5295,11 +5477,14 @@ name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "axum",
 "bytes",
+ "cc",
 "chrono",
 "clap",
 "clap_builder",
 "crossbeam-utils",
+ "digest",
 "either",
 "fail",
 "futures",
@@ -5308,6 +5493,7 @@ dependencies = [
 "futures-executor",
 "futures-sink",
 "futures-util",
+ "hyper",
 "itertools",
 "libc",
 "log",
@@ -5326,9 +5512,10 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
+ "smallvec",
 "socket2 0.4.9",
 "syn 1.0.109",
- "syn 2.0.16",
+ "syn 2.0.28",
 "tokio",
 "tokio-rustls 0.23.4",
 "tokio-util",
@@ -5337,7 +5524,6 @@ dependencies = [
 "tower",
 "tracing",
 "tracing-core",
- "tracing-subscriber",
 "url",
 ]

@@ -5358,15 +5544,6 @@ dependencies = [
 "time",
 ]

-[[package]]
-name = "xattr"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "xattr"
 version = "1.0.0"
@@ -5396,3 +5573,33 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
+
+[[package]]
+name = "zstd"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "6.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.8+zstd.1.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
    "libs/remote_storage",
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
+    "libs/vm_monitor",
 ]

 [workspace.package]
@@ -41,12 +42,14 @@ aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
 aws-credential-types = "0.55"
 aws-types = "0.55"
+axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
+cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
 close_fds = "0.3.2"
@@ -54,6 +57,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
+dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -73,6 +77,7 @@ humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.9"
+inotify = "0.10.2"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -88,7 +93,7 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-pbkdf2 = "0.12.1"
+pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
@@ -104,12 +109,14 @@ rustls = "0.20"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
+sysinfo = "0.29.2"
 sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
+smallvec = "1.11"
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -132,7 +139,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
@@ -168,6 +175,7 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
+vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
--- a/Dockerfile
+++ b/Dockerfile
@@ -51,6 +51,7 @@ RUN set -e \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
+      --bin neon_local \
      --locked --release \
    && cachepot -s

@@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -551,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
-# There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
-    echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
+    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -766,29 +764,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
-#
-# Extenstion only
-#
-#########################################################################################
-FROM python:3.9-slim-bullseye AS generate-ext-index
-ARG PG_VERSION
-ARG BUILD_TAG
-RUN apt update && apt install -y zstd
-
-# copy the control files here
-COPY --from=kq-imcx-pg-build /extensions/ /extensions/
-COPY --from=pg-anon-pg-build /extensions/ /extensions/
-COPY --from=postgis-build /extensions/ /extensions/
-COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
-
-FROM scratch AS postgres-extensions
-# After the transition this layer will include all extensitons.
-# As for now, it's only a couple for testing purposses
-COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
-COPY --from=generate-ext-index /ext_index.json /ext_index.json
-
 #########################################################################################
 #
 # Final layer
@@ -818,6 +793,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
+# ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
@@ -841,7 +817,8 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g && \
+        zlib1g \
+        ca-certificates && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/Makefile
+++ b/Makefile
@@ -108,6 +108,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+	+@echo "Compiling amcheck $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/README.md
+++ b/README.md
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev
+libcurl4-openssl-dev openssl python-poetry
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel
+  protobuf-devel libcurl-devel openssl poetry
 ```
 * On Arch based systems, these packages are needed:
 ```bash
@@ -235,6 +235,13 @@ CARGO_BUILD_FLAGS="--features=testing" make
 ./scripts/pytest
 ```

+By default, this runs both debug and release modes, and all supported postgres versions. When
+testing locally, it is convenient to run just one set of permutations, like this:
+
+```sh
+DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
+```
+
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 chrono.workspace = true
+cfg-if.workspace = true
 clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
@@ -23,6 +24,7 @@ tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -32,3 +34,7 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
+zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,6 +5,8 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
+//! - If `--remote-ext-config` is provided, it will be used to fetch the extensions list
+//!   and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,15 +29,15 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres
+//!             -b /usr/local/bin/postgres \
+//!             -r '{"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}'
 //! ```
 //!
 use std::collections::HashMap;
 use std::fs::File;
-use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex};
+use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -48,22 +50,33 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-const BUILD_TAG_DEFAULT: &str = "local";
+// This is an arbitrary build tag. Fine as a default / for testing purposes
+// in case the BUILD_TAG environment variable is not set at build time.
+const BUILD_TAG_DEFAULT: &str = "5670669815";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
-
+    let build_tag = option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string();
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
+    let ext_remote_storage = remote_ext_config.map(|x| {
+        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
+    });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -128,14 +141,12 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
        // First, try to get cluster spec from the cli argument
        Some(json) => {
+            info!("got spec from cli argument {}", json);
            spec = Some(serde_json::from_str(json)?);
        }
        None => {
@@ -168,8 +179,10 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
+
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        info!("new pspec.spec: {:?}", pspec.spec);
        new_state.pspec = Some(pspec);
        spec_set = true;
    } else {
@@ -179,20 +192,35 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
+        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
+        ext_remote_storage,
+        ext_download_progress: RwLock::new(HashMap::new()),
+        build_tag,
    };
    let compute = Arc::new(compute_node);

+    // If this is a pooled VM, prewarm before starting HTTP server and becoming
+    // available for binding. Prewarming helps postgres start quicker later,
+    // because QEMU will already have its memory allocated from the host, and
+    // the necessary binaries will already be cached.
+    if !spec_set {
+        compute.prewarm_postgres()?;
+    }
+
    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
+
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -229,7 +257,7 @@ fn main() -> Result<()> {
    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute() {
+    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -242,6 +270,55 @@ fn main() -> Result<()> {
        }
    };

+    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
+    // because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            use std::env;
+            use tokio_util::sync::CancellationToken;
+            use tracing::warn;
+            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let cgroup = matches.get_one::<String>("cgroup");
+            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
+
+            // Only make a runtime if we need to.
+            // Note: it seems like you can make a runtime in an inner scope and
+            // if you start a task in it it won't be dropped. However, make it
+            // in the outermost scope just to be safe.
+            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
+                (None, None) => None,
+                (None, Some(_)) => {
+                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
+                    None
+                }
+                (Some(_), None) => {
+                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
+                }
+                (Some(_), Some(_)) => Some(
+                    tokio::runtime::Builder::new_multi_thread()
+                        .worker_threads(4)
+                        .enable_all()
+                        .build()
+                        .expect("failed to create tokio runtime for monitor"),
+                ),
+            };
+
+            // This token is used internally by the monitor to clean up all threads
+            let token = CancellationToken::new();
+
+            let vm_monitor = &rt.as_ref().map(|rt| {
+                rt.spawn(vm_monitor::start(
+                    Box::leak(Box::new(vm_monitor::Args {
+                        cgroup: cgroup.cloned(),
+                        pgconnstr: file_cache_connstr.cloned(),
+                        addr: vm_monitor_addr.cloned().unwrap(),
+                    })),
+                    token.clone(),
+                ))
+            });
+        }
+    }
+
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    if let Some(mut pg) = pg {
@@ -255,6 +332,24 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }

+    // Terminate the vm_monitor so it releases the file watcher on
+    // /sys/fs/cgroup/neon-postgres.
+    // Note: the vm-monitor only runs on linux because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            if let Some(handle) = vm_monitor {
+                // Kills all threads spawned by the monitor
+                token.cancel();
+                // Kills the actual task running the monitor
+                handle.abort();
+
+                // If handle is some, rt must have been used to produce it, and
+                // hence is also some
+                rt.unwrap().shutdown_timeout(Duration::from_secs(2));
+            }
+        }
+    }
+
    // Maybe sync safekeepers again, to speed up next startup
    let compute_state = compute.state.lock().unwrap().clone();
    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -358,6 +453,35 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
+        .arg(
+            Arg::new("remote-ext-config")
+                .short('r')
+                .long("remote-ext-config")
+                .value_name("REMOTE_EXT_CONFIG"),
+        )
+        // TODO(fprasx): we currently have default arguments because the cloud PR
+        // to pass them in hasn't been merged yet. We should get rid of them once
+        // the PR is merged.
+        .arg(
+            Arg::new("vm-monitor-addr")
+                .long("vm-monitor-addr")
+                .default_value("0.0.0.0:10301")
+                .value_name("VM_MONITOR_ADDR"),
+        )
+        .arg(
+            Arg::new("cgroup")
+                .long("cgroup")
+                .default_value("neon-postgres")
+                .value_name("CGROUP"),
+        )
+        .arg(
+            Arg::new("filecache-connstr")
+                .long("filecache-connstr")
+                .default_value(
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
+                )
+                .value_name("FILECACHE_CONNSTR"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,16 +1,21 @@
+use std::collections::HashMap;
+use std::env;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex};
+use std::sync::{Condvar, Mutex, RwLock};
+use std::time::Instant;

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
+use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -20,10 +25,12 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use crate::config;
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
+use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -31,6 +38,7 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -50,6 +58,19 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
+    ///  the S3 bucket that we search for extensions in
+    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    // key: ext_archive_name, value: started download time, download_completed?
+    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
+    pub build_tag: String,
+}
+
+// store some metrics about download size that might impact startup time
+#[derive(Clone, Debug)]
+pub struct RemoteExtensionMetrics {
+    num_ext_downloaded: u64,
+    largest_ext_size: u64,
+    total_ext_download_size: u64,
 }

 #[derive(Clone, Debug)]
@@ -155,6 +176,27 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

+/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
+/// cgroup. Otherwise returns the default `Command::new(cmd)`
+///
+/// This function should be used to start postgres, as it will start it in the
+/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
+/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
+/// creates it during the sysinit phase of its inittab.
+fn maybe_cgexec(cmd: &str) -> Command {
+    // The cplane sets this env var for autoscaling computes.
+    // use `var_os` so we don't have to worry about the variable being valid
+    // unicode. Should never be a concern . . . but just in case
+    if env::var_os("AUTOSCALING").is_some() {
+        let mut command = Command::new("cgexec");
+        command.args(["-g", "memory:neon-postgres"]);
+        command.arg(cmd);
+        command
+    } else {
+        Command::new(cmd)
+    }
+}
+
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -260,7 +302,7 @@ impl ComputeNode {
    #[instrument(skip_all, fields(%lsn))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
-        let start_time = Utc::now();
+        let start_time = Instant::now();

        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;

@@ -273,7 +315,10 @@ impl ComputeNode {
            info!("Storage auth token not set");
        }

+        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
+        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
+
        let basebackup_cmd = match lsn {
            // HACK We don't use compression on first start (Lsn(0)) because there's no API for it
            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
@@ -319,13 +364,10 @@ impl ComputeNode {
        };

        // Report metrics
-        self.state.lock().unwrap().metrics.basebackup_bytes =
-            measured_reader.get_byte_count() as u64;
-        self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
-            .signed_duration_since(start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
+        let mut state = self.state.lock().unwrap();
+        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
+        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
+        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
        Ok(())
    }

@@ -431,7 +473,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let sync_handle = Command::new(&self.pgbin)
+        let sync_handle = maybe_cgexec(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -473,14 +515,22 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+    pub fn prepare_pgdata(
+        &self,
+        compute_state: &ComputeState,
+        extension_server_port: u16,
+    ) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
+        config::write_postgres_conf(
+            &pgdata_path.join("postgresql.conf"),
+            &pspec.spec,
+            Some(extension_server_port),
+        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
@@ -532,6 +582,50 @@ impl ComputeNode {
        Ok(())
    }

+    /// Start and stop a postgres process to warm up the VM for startup.
+    ///
+    /// Runs `initdb` into a scratch `<pgdata>.warmup` directory, writes a
+    /// minimal `postgresql.conf`, starts postgres on it via `maybe_cgexec`,
+    /// waits for `wait_for_postgres` to return and then kills the process.
+    /// The scratch directory is removed (best effort) on every exit path.
+    ///
+    /// Returns an error instead of panicking when `initdb` or postgres cannot
+    /// be spawned, when `initdb` exits unsuccessfully, or when waiting for
+    /// postgres fails; in the last case the child is killed first so it is
+    /// not left running.
+    pub fn prewarm_postgres(&self) -> Result<()> {
+        use anyhow::Context;
+        use std::io::Write;
+
+        info!("prewarming");
+
+        // Create pgdata
+        let pgdata = &format!("{}.warmup", self.pgdata);
+        create_pgdata(pgdata)?;
+
+        let warmup = || -> Result<()> {
+            // Run initdb to completion
+            info!("running initdb");
+            let initdb_bin = Path::new(&self.pgbin)
+                .parent()
+                .context("pgbin has no parent directory")?
+                .join("initdb");
+            let initdb_output = Command::new(initdb_bin)
+                .args(["-D", pgdata])
+                .output()
+                .context("cannot start initdb process")?;
+            // `output()` succeeding only means the process ran; check that
+            // initdb itself reported success.
+            if !initdb_output.status.success() {
+                anyhow::bail!(
+                    "initdb failed with {}: {}",
+                    initdb_output.status,
+                    String::from_utf8_lossy(&initdb_output.stderr)
+                );
+            }
+
+            // Write conf
+            let conf_path = Path::new(pgdata).join("postgresql.conf");
+            let mut file = std::fs::File::create(conf_path)?;
+            writeln!(file, "shared_buffers=65536")?;
+            writeln!(file, "port=51055")?; // Nobody should be connecting
+            writeln!(file, "shared_preload_libraries = 'neon'")?;
+
+            // Start postgres
+            info!("starting postgres");
+            let mut pg = maybe_cgexec(&self.pgbin)
+                .args(["-D", pgdata])
+                .spawn()
+                .context("cannot start postgres process")?;
+
+            // Stop it when it's ready
+            info!("waiting for postgres");
+            if let Err(e) = wait_for_postgres(&mut pg, Path::new(pgdata)) {
+                // Best-effort stop: the child may already have exited, so
+                // failures to kill/reap it are not interesting here.
+                let _ = pg.kill();
+                let _ = pg.wait();
+                return Err(e.into());
+            }
+            pg.kill()?;
+            info!("sent kill signal");
+            pg.wait()?;
+            info!("done prewarming");
+            Ok(())
+        };
+        let result = warmup();
+
+        // clean up (best effort), whether or not the warm-up succeeded
+        let _ok = fs::remove_dir_all(pgdata);
+        result
+    }
+
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
@@ -542,7 +636,7 @@ impl ComputeNode {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
-        let mut pg = Command::new(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.pgbin)
            .args(["-D", &self.pgdata])
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
@@ -626,7 +720,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -656,7 +750,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<std::process::Child> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -667,7 +761,38 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_pgdata(&compute_state)?;
+        info!(
+            "start_compute spec.remote_extensions {:?}",
+            pspec.spec.remote_extensions
+        );
+
+        // This part is sync, because we need to download
+        // remote shared_preload_libraries before postgres start (if any)
+        if let Some(remote_extensions) = &pspec.spec.remote_extensions {
+            // First, create control files for all available extensions
+            extension_server::create_control_files(remote_extensions, &self.pgbin);
+
+            let library_load_start_time = Utc::now();
+            let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;
+
+            let library_load_time = Utc::now()
+                .signed_duration_since(library_load_start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            let mut state = self.state.lock().unwrap();
+            state.metrics.load_ext_ms = library_load_time;
+            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
+            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
+            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
+            info!(
+                "Loading shared_preload_libraries took {:?}ms",
+                library_load_time
+            );
+            info!("{:?}", remote_ext_metrics);
+        }
+
+        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -815,4 +940,172 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
+
+    /// Download an extension archive, unzip it and place its files in the
+    /// correct locations, coordinating with other callers asking for the
+    /// same archive.
+    ///
+    /// Progress is tracked per archive name in `ext_download_progress` as a
+    /// `(start time, completed)` pair:
+    /// * completed: returns `Ok(0)` without downloading again;
+    /// * started by another caller less than `HANG_TIMEOUT` ms ago: polls
+    ///   every 500 ms until the entry is marked completed, then returns
+    ///   `Ok(0)`;
+    /// * otherwise: downloads through `extension_server::download_extension`
+    ///   and returns its result, with errors wrapped in
+    ///   `DownloadError::Other`.
+    ///
+    /// Returns `DownloadError::BadInput` when no remote extension storage is
+    /// configured. Panics if `ext_path` has no object name or if the progress
+    /// lock is poisoned.
+    pub async fn download_extension(
+        &self,
+        real_ext_name: String,
+        ext_path: RemotePath,
+    ) -> Result<u64, DownloadError> {
+        let remote_storage = self.ext_remote_storage.as_ref().ok_or_else(|| {
+            DownloadError::BadInput(anyhow::anyhow!(
+                "Remote extensions storage is not configured",
+            ))
+        })?;
+
+        let ext_archive_name = ext_path.object_name().expect("bad path");
+
+        // Look up and, if absent, register the download under one write lock.
+        // Doing the check under a read lock and the insert under a separate
+        // write lock would let two concurrent callers both conclude that they
+        // are the first one. The guard is dropped before any `.await`.
+        let (first_try, download_start, download_completed) = {
+            let mut progress = self.ext_download_progress.write().expect("lock err");
+            let existing = progress.get(ext_archive_name).copied();
+            match existing {
+                Some((started_at, completed)) => (false, started_at, completed),
+                None => {
+                    let now = Utc::now();
+                    progress.insert(ext_archive_name.to_string(), (now, false));
+                    (true, now, false)
+                }
+            }
+        };
+
+        // `to_std()` fails for a negative duration (wall clock stepped
+        // backwards); treat that as "just started" instead of panicking.
+        let start_time_delta = Utc::now()
+            .signed_duration_since(download_start)
+            .to_std()
+            .unwrap_or_default()
+            .as_millis() as u64;
+
+        // how long to wait for extension download if it was started by another process
+        const HANG_TIMEOUT: u64 = 3000; // milliseconds
+
+        if download_completed {
+            info!("extension already downloaded, skipping re-download");
+            return Ok(0);
+        } else if start_time_delta < HANG_TIMEOUT && !first_try {
+            info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
+            let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
+            loop {
+                info!("waiting for download");
+                interval.tick().await;
+                let (_, download_completed_now) =
+                    self.ext_download_progress.read().expect("lock")[ext_archive_name];
+                if download_completed_now {
+                    info!("download finished by whoever else downloaded it");
+                    return Ok(0);
+                }
+            }
+            // NOTE: the above loop will get terminated
+            // based on the timeout of the download function
+        }
+
+        // if extension hasn't been downloaded before or the previous
+        // attempt to download was at least HANG_TIMEOUT ms ago
+        // then we try to download it here
+        info!("downloading new extension {ext_archive_name}");
+
+        let download_size = extension_server::download_extension(
+            &real_ext_name,
+            &ext_path,
+            remote_storage,
+            &self.pgbin,
+        )
+        .await
+        .map_err(DownloadError::Other);
+
+        // NOTE(review): the entry is marked completed even when the download
+        // above returned an error, so later calls return `Ok(0)` without
+        // retrying. Confirm this is intended.
+        self.ext_download_progress
+            .write()
+            .expect("bad lock")
+            .insert(ext_archive_name.to_string(), (download_start, true));
+
+        download_size
+    }
+
+    /// Download the remote extensions that provide the libraries listed in
+    /// `shared_preload_libraries`.
+    ///
+    /// Library names are collected from `spec.cluster.settings` and from the
+    /// last `shared_preload_libraries` line of `spec.cluster.postgresql_conf`
+    /// (`neon` is always skipped). Names missing from the remote
+    /// `library_index` are assumed to be present locally and are ignored.
+    /// All remaining libraries are downloaded concurrently.
+    ///
+    /// A failed download is logged and counted as size 0; it does not fail
+    /// the call. Returns zeroed metrics when no remote extension storage is
+    /// configured, and an error when the spec has no `remote_extensions` or
+    /// `get_ext` fails for a library.
+    #[tokio::main]
+    pub async fn prepare_preload_libraries(
+        &self,
+        spec: &ComputeSpec,
+    ) -> Result<RemoteExtensionMetrics> {
+        // Split a `shared_preload_libraries` value such as `'neon, foo,bar'`
+        // into library names, dropping quotes, blanks and `neon` itself.
+        fn parse_library_list(value: &str) -> Vec<String> {
+            value
+                .split(&[',', '\'', ' '])
+                .filter(|s| *s != "neon" && !s.is_empty())
+                .map(str::to_string)
+                .collect()
+        }
+
+        if self.ext_remote_storage.is_none() {
+            return Ok(RemoteExtensionMetrics {
+                num_ext_downloaded: 0,
+                largest_ext_size: 0,
+                total_ext_download_size: 0,
+            });
+        }
+        let remote_extensions = spec
+            .remote_extensions
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("Remote extensions are not configured",))?;
+
+        info!("parse shared_preload_libraries from spec.cluster.settings");
+        let mut libs_vec = Vec::new();
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            libs_vec = parse_library_list(&libs);
+        }
+        info!("parse shared_preload_libraries from provided postgresql.conf");
+
+        // that is used in neon_local and python tests
+        if let Some(conf) = &spec.cluster.postgresql_conf {
+            // The last matching line wins, as with repeated settings.
+            let preload_line = conf
+                .split('\n')
+                .rev()
+                .find(|line| line.starts_with("shared_preload_libraries"));
+            // Split on the first `=` so that both `name='a,b'` and
+            // `name = 'a,b'` are understood; quotes and blanks are stripped
+            // by `parse_library_list`.
+            if let Some((_, value)) = preload_line.and_then(|line| line.split_once('=')) {
+                libs_vec.extend(parse_library_list(value));
+            }
+        }
+
+        // A library may be named in both places; download it only once.
+        libs_vec.sort();
+        libs_vec.dedup();
+
+        // Don't try to download libraries that are not in the index.
+        // Assume that they are already present locally.
+        libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));
+
+        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
+
+        let mut download_tasks = Vec::new();
+        for library in &libs_vec {
+            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            download_tasks.push(self.download_extension(ext_name, ext_path));
+        }
+        let results = join_all(download_tasks).await;
+
+        let mut remote_ext_metrics = RemoteExtensionMetrics {
+            num_ext_downloaded: 0,
+            largest_ext_size: 0,
+            total_ext_download_size: 0,
+        };
+        for result in results {
+            let download_size = match result {
+                Ok(res) => {
+                    remote_ext_metrics.num_ext_downloaded += 1;
+                    res
+                }
+                Err(err) => {
+                    // if we failed to download an extension, we don't want to fail the whole
+                    // process, but we do want to log the error
+                    error!("Failed to download extension: {}", err);
+                    0
+                }
+            };
+
+            remote_ext_metrics.largest_ext_size =
+                std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
+            remote_ext_metrics.total_ext_download_size += download_size;
+        }
+        Ok(remote_ext_metrics)
+    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+pub fn write_postgres_conf(
+    path: &Path,
+    spec: &ComputeSpec,
+    extension_server_port: Option<u16>,
+) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    if let Some(port) = extension_server_port {
+        writeln!(file, "neon.extension_server_port={}", port)?;
+    }
+
    Ok(())
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -0,0 +1,221 @@
+// Download extension files from the extension store
+// and put them in the right place in the postgres directory (share / lib)
+/*
+The layout of the S3 bucket is as follows:
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+ext_index.json stores the control files and location of extension archives
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate extension.tar.zst files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+*/
+use anyhow::Context;
+use anyhow::{self, Result};
+use compute_api::spec::RemoteExtSpec;
+use remote_storage::*;
+use serde_json;
+use std::io::Read;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::Path;
+use std::str;
+use tar::Archive;
+use tokio::io::AsyncReadExt;
+use tracing::info;
+use tracing::log::warn;
+use zstd::stream::read::Decoder;
+
+/// Run `pg_config <argument>` and return its trimmed standard output.
+///
+/// `argument` is a flag like `--version` or `--sharedir`. `pg_config` is
+/// taken from the same directory as the `postgres` binary named by `pgbin`;
+/// when `pgbin` is the bare name `postgres`, the bare name `pg_config` is
+/// used, so it is looked up the same way `postgres` would be.
+///
+/// Panics if `pgbin` does not end in `postgres`, if `pg_config` cannot be
+/// executed, or if its output is not valid UTF-8.
+fn get_pg_config(argument: &str, pgbin: &str) -> String {
+    let bin_dir = pgbin.strip_suffix("postgres").expect("bad pgbin");
+    // Join as a path instead of appending "/pg_config" to the string: for a
+    // bare `postgres` the directory part is empty, and concatenation would
+    // yield the absolute path `/pg_config` in the filesystem root.
+    let pgconfig = Path::new(bin_dir).join("pg_config");
+    let config_output = std::process::Command::new(pgconfig)
+        .arg(argument)
+        .output()
+        .expect("pg_config error");
+    std::str::from_utf8(&config_output.stdout)
+        .expect("pg_config error")
+        .trim()
+        .to_string()
+}
+
+/// Map the output of `pg_config --version` to the version tag `v14`/`v15`.
+///
+/// `pg_config --version` returns a (platform specific) human readable string
+/// such as "PostgreSQL 15.4". Only the major version number is compared, so
+/// a minor version that happens to contain another major number (e.g.
+/// "PostgreSQL 14.15") is still classified by its major version.
+///
+/// Panics if the major version is neither 14 nor 15.
+pub fn get_pg_version(pgbin: &str) -> String {
+    let human_version = get_pg_config("--version", pgbin);
+    // The version number is the first whitespace-separated token that starts
+    // with a digit; its leading run of digits is the major version.
+    let major: String = human_version
+        .split_whitespace()
+        .find(|token| token.starts_with(|c: char| c.is_ascii_digit()))
+        .map(|token| token.chars().take_while(char::is_ascii_digit).collect())
+        .unwrap_or_default();
+    match major.as_str() {
+        "15" => "v15".to_string(),
+        "14" => "v14".to_string(),
+        _ => panic!("Unsuported postgres version {human_version}"),
+    }
+}
+
+// Fetch the archive for one extension from remote storage and install it.
+//
+// The archive at `ext_path` is read fully into memory, zstd-decompressed and
+// unpacked as a tar archive into `<prefix>/download_extensions`, where
+// `<prefix>` is `pgbin` without its `/bin/postgres` suffix (panics if the
+// suffix is missing). Every entry of the unpacked `share/extension` and `lib`
+// directories is then renamed into `<pg_config --sharedir>/extension` and
+// `<pg_config --pkglibdir>` respectively; a failed rename is only logged.
+//
+// Returns the size in bytes of the downloaded (still compressed) archive.
+pub async fn download_extension(
+    ext_name: &str,
+    ext_path: &RemotePath,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+) -> Result<u64> {
+    info!("Download extension {:?} from {:?}", ext_name, ext_path);
+    let mut download = remote_storage.download(ext_path).await?;
+    let mut compressed = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut compressed)
+        .await?;
+    let download_size = compressed.len() as u64;
+
+    // it's unclear whether it is more performant to decompress into memory or not
+    // TODO: decompressing into memory can be avoided
+    let mut tarball = Vec::new();
+    Decoder::new(compressed.as_slice())?.read_to_end(&mut tarball)?;
+
+    let install_prefix = pgbin.strip_suffix("/bin/postgres").expect("bad pgbin");
+    let unzip_dest = format!("{install_prefix}/download_extensions");
+    Archive::new(tarball.as_slice()).unpack(&unzip_dest)?;
+    info!("Download + unzip {:?} completed successfully", &ext_path);
+
+    // (directory inside the unpacked archive, local directory it is moved to)
+    let moves = [
+        (
+            format!("{unzip_dest}/share/extension"),
+            Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
+        ),
+        (
+            format!("{unzip_dest}/lib"),
+            Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
+        ),
+    ];
+    for (zip_dir, real_dir) in moves {
+        info!("mv {zip_dir:?}/*  {real_dir:?}");
+        for entry in std::fs::read_dir(zip_dir)? {
+            let old_file = entry?.path();
+            let file_name = old_file.file_name().context("error parsing file")?;
+            let new_file = real_dir.join(file_name);
+            info!("moving {old_file:?} to {new_file:?}");
+
+            // A rename can fail, e.g. with
+            // "Directory not empty (os error 39)"; that is tolerated.
+            if let Err(e) = std::fs::rename(old_file, new_file) {
+                warn!("move failed, probably because the extension already exists: {e}")
+            } else {
+                info!("move succeeded")
+            }
+        }
+    }
+    info!("done moving extension {ext_name}");
+    Ok(download_size)
+}
+
+// Write the extension control files listed in the spec into the local
+// `<pg_config --sharedir>/extension` directory.
+//
+// A control file that already exists locally is left untouched and a warning
+// is logged. Panics if a control file cannot be written.
+pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    for ext_data in remote_extensions.extension_data.values() {
+        for (control_name, control_content) in &ext_data.control_data {
+            let control_path = local_sharedir.join(control_name);
+            if control_path.exists() {
+                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
+                continue;
+            }
+            info!("writing file {:?}{:?}", control_path, control_content);
+            std::fs::write(control_path, control_content).unwrap();
+        }
+    }
+}
+
+// Build the remote storage client for extensions from its JSON description.
+//
+// `remote_ext_config` is a JSON object with the string fields `bucket` and
+// `region` and the optional string fields `endpoint` and `prefix`. They are
+// mapped onto an S3 storage configuration; the concurrency and error limits
+// are all fixed at 100. Returns an error if the JSON cannot be parsed or the
+// storage cannot be created from the configuration.
+pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
+    #[derive(Debug, serde::Deserialize)]
+    struct RemoteExtJson {
+        bucket: String,
+        region: String,
+        endpoint: Option<String>,
+        prefix: Option<String>,
+    }
+    let RemoteExtJson {
+        bucket,
+        region,
+        endpoint,
+        prefix,
+    } = serde_json::from_str(remote_ext_config)?;
+
+    let hundred = NonZeroUsize::new(100).expect("100 != 0");
+    let s3_config = S3Config {
+        bucket_name: bucket,
+        bucket_region: region,
+        prefix_in_bucket: prefix,
+        endpoint,
+        concurrency_limit: hundred,
+        max_keys_per_list_response: None,
+    };
+    let storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: hundred,
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
+        storage: RemoteStorageKind::AwsS3(s3_config),
+    };
+    GenericRemoteStorage::from_config(&storage_config)
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use num_cpus;
 use serde_json;
 use tokio::task;
-use tracing::{error, info};
+use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
@@ -121,6 +121,78 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        // download extension files from S3 on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+
+            // don't even try to download extensions
+            // if no remote storage is configured
+            if compute.ext_remote_storage.is_none() {
+                info!("no extensions remote storage configured");
+                let mut resp = Response::new(Body::from("no remote storage configured"));
+                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                return resp;
+            }
+
+            let mut is_library = false;
+            if let Some(params) = req.uri().query() {
+                info!("serving {:?} POST request with params: {}", route, params);
+                if params == "is_library=true" {
+                    is_library = true;
+                } else {
+                    let mut resp = Response::new(Body::from("Wrong request parameters"));
+                    *resp.status_mut() = StatusCode::BAD_REQUEST;
+                    return resp;
+                }
+            }
+            let filename = route.split('/').last().unwrap().to_string();
+            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
+
+            // get ext_name and path from spec
+            // don't lock compute_state for too long
+            let ext = {
+                let compute_state = compute.state.lock().unwrap();
+                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+                let spec = &pspec.spec;
+
+                // debug only
+                info!("spec: {:?}", spec);
+
+                let remote_extensions = match spec.remote_extensions.as_ref() {
+                    Some(r) => r,
+                    None => {
+                        info!("no remote extensions spec was provided");
+                        let mut resp = Response::new(Body::from("no remote storage configured"));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        return resp;
+                    }
+                };
+
+                remote_extensions.get_ext(&filename, is_library)
+            };
+
+            match ext {
+                Ok((ext_name, ext_path)) => {
+                    match compute.download_extension(ext_name, ext_path).await {
+                        Ok(_) => Response::new(Body::from("OK")),
+                        Err(e) => {
+                            error!("extension download failed: {}", e);
+                            let mut resp = Response::new(Body::from(e.to_string()));
+                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                            resp
+                        }
+                    }
+                }
+                Err(e) => {
+                    warn!("extension download failed to find extension: {}", e);
+                    let mut resp = Response::new(Body::from("failed to find file"));
+                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                    resp
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,6 +139,34 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
+  # NOTE(review): the handler matches POST /extension_server/<filename> with
+  # an optional `is_library=true` query and answers errors with a plain-text
+  # body; confirm the path and the error content types documented here.
+  /extension_server:
+    post:
+      tags:
+      - Extension
+      summary: Download extension from S3 to local folder.
+      description: ""
+      operationId: downloadExtension
+      responses:
+        200:
+          description: Extension downloaded
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'OK' if download succeeded.
+                example: "OK"
+        400:
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,6 +9,7 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

    update_pg_hba(pgdata_path)?;

@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,3 +32,4 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
+tracing.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
+
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
        "stop" => {
@@ -823,6 +825,16 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
    }
 }

+// Collect the values of every `safekeeper-extra-opt` argument, in the order
+// given; they are appended to the safekeeper command invocation. Returns an
+// empty list when the argument was not passed.
+fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
+    match init_match.get_many::<String>("safekeeper-extra-opt") {
+        Some(opts) => opts.cloned().collect(),
+        None => Vec::new(),
+    }
+}
+
 fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(safekeeper_command_data) => safekeeper_command_data,
@@ -839,7 +851,9 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul

    match sub_name {
        "start" => {
-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -864,7 +878,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -891,7 +906,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start() {
+        if let Err(e) = safekeeper.start(vec![]) {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false);
            exit(1);
@@ -954,6 +969,14 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
+        .short('e')
+        .long("safekeeper-extra-opt")
+        .num_args(1)
+        .action(ArgAction::Append)
+        .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
+        .required(false);
+
    let tenant_id_arg = Arg::new("tenant-id")
        .long("tenant-id")
        .help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -1003,6 +1026,12 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

+    let remote_ext_config_args = Arg::new("remote-ext-config")
+        .long("remote-ext-config")
+        .num_args(1)
+        .help("Configure the S3 bucket that we search for extensions in.")
+        .required(false);
+
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1116,6 +1145,7 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
+                            .arg(safekeeper_extra_opt_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1126,6 +1156,7 @@ fn cli() -> Command {
                            .about("Restart local safekeeper")
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
+                            .arg(safekeeper_extra_opt_arg)
                )
        )
        .subcommand(
@@ -1161,6 +1192,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
+                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -313,7 +313,7 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is availiable.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
@@ -420,7 +420,12 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(
+        &self,
+        auth_token: &Option<String>,
+        safekeepers: Vec<NodeId>,
+        remote_ext_config: Option<&String>,
+    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -488,6 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
+            remote_extensions: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,6 +525,11 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
+
+        if let Some(remote_ext_config) = remote_ext_config {
+            cmd.args(["--remote-ext-config", remote_ext_config]);
+        }
+
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -101,7 +101,7 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
@@ -161,17 +161,28 @@ impl SafekeeperNode {

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
+            let key_path_string = key_path
+                .to_str()
+                .with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?
+                .to_owned();
            args.extend([
-                "--auth-validation-public-key-path".to_owned(),
-                key_path
-                    .to_str()
-                    .with_context(|| {
-                        format!("Key path {key_path:?} cannot be represented as a unicode string")
-                    })?
-                    .to_owned(),
+                "--pg-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--pg-tenant-only-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--http-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
            ]);
        }

+        args.extend(extra_opts);
+
        background_process::start_process(
            &format!("safekeeper-{id}"),
            &datadir,
--- a/deny.toml
+++ b/deny.toml
@@ -4,7 +4,12 @@
 # to your expectations and requirements.

 # Root options
-targets = []
+targets = [
+    { triple = "x86_64-unknown-linux-gnu" },
+    { triple = "aarch64-unknown-linux-gnu" },
+    { triple = "aarch64-apple-darwin" },
+    { triple = "x86_64-apple-darwin" },
+]
 all-features = false
 no-default-features = false
 feature-depth = 1
@@ -18,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = []
+ignore = ["RUSTSEC-2023-0052"]

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -0,0 +1,236 @@
+# Supporting custom user Extensions (Dynamic Extension Loading)
+Created 2023-05-03
+
+## Motivation
+
+There are many extensions in the PostgreSQL ecosystem, and not all extensions
+are of a quality that we can confidently support them. Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading extensions on-demand into
+the container's file system for dynamically loaded extensions, and will also
+make sure that the extensions in `shared_preload_libraries` are downloaded
+before the compute node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts we download:
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`;
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- When creating an extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)
+
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+   
+   Pros:
+    - Easy to deploy
+
+   Cons:
+    - Doesn't scale - first start size is dependent on image size;
+    - All extensions are shared across all users: It doesn't allow users to
+      bring their own restrictive-licensed extensions
+
+2. Bring Your Own compute image
+   
+   Pros:
+    - Still easy to deploy
+    - User can bring own patched version of PostgreSQL
+
+   Cons:
+    - First start latency is O(sizeof(extensions image))
+    - Warm instance pool for skipping pod schedule latency is not feasible with
+      O(n) custom images
+    - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues for "clean" users.
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - Downloading all extensions in advance takes a lot of time, thus startup
+      latency issues
+
+4. Store user's extensions in persistent storage
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - EC2 instances have only limited number of attachments shared between EBS
+      volumes, direct-attached NVMe drives, and ENIs.
+    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
+      the device is unavailable whilst moving the mount between instances).
+    - EBS can only mount on one instance at a time (except the expensive IO2
+      device type).
+
+5. Store user's extensions in network drive
+   
+   Pros:
+    - Easy to deploy
+    - Few startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - We'd need networked drives, and a lot of them, which would store many
+      duplicate extensions.
+    - **UNCHECKED:** Compute instance migration may not work nicely with
+      networked IOs
+
+
+### Idea extensions
+
+The extension store does not have to be S3 directly, but could be a Node-local
+caching service on top of S3. This would reduce the load on the network for
+popular extensions.
+
+## Extension Storage implementation
+
+The layout of the S3 bucket is as follows:
+```
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+```
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+`ext_index.json` stores the control files and location of extension archives.
+It also stores a list of public extensions and a `library_index`.
+
+We don't need to duplicate `extension.tar.zst` files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+```
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+```
+
+### How to add new extension to the Extension Storage?
+
+Simply upload build artifacts to the S3 bucket.
+Implement a CI step for that, split from the compute-node-image build.
+
+### How do we deal with extension versions and updates?
+
+Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
+This is needed to ensure that `/share` and `/lib` files are in sync.
+
+For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
+
+### Alternatives
+
+For extensions written in trusted languages we can also adopt
+`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
+This will increase the number of supported extensions and decrease the amount of work required to support them.
--- a/docs/rfcs/026-pageserver-s3-mvcc.md
+++ b/docs/rfcs/026-pageserver-s3-mvcc.md
@@ -0,0 +1,316 @@
+This is a copy from the [original Notion page](https://www.notion.so/neondatabase/Proposal-Pageserver-MVCC-S3-Storage-8a424c0c7ec5459e89d3e3f00e87657c?pvs=4), taken on 2023-08-16.
+
+This is for archival mostly.
+The RFC that we're likely to go with is https://github.com/neondatabase/neon/pull/4919.
+
+---
+
+# Proposal: Pageserver MVCC S3 Storage
+
+tl;dr: this proposal enables Control Plane to attach a tenant to a new pageserver without being 100% certain that it has been detached from the old pageserver. This enables us to automate failover if a pageserver dies (no human in the loop).
+
+# Problem Statement
+
+The current Neon architecture requires the Control Plane to guarantee that a tenant is only attached to one pageserver at a time. If a tenant is attached to multiple pageservers simultaneously, the pageservers will overwrite each other’s changes in S3 for that tenant, resulting in data loss for that tenant.
+
+The above imposes limitations on tenant relocation and future designs for high availability. For instance, Control Plane cannot relocate a tenant to another pageserver before it is 100% certain that the tenant is detached from the source pageserver. If the source pageserver is unresponsive, the tenant detach procedure cannot proceed, and Control Plane has no choice but to wait for either the source to become responsive again, or rely on a node failure detection mechanism to detect that the source pageserver is dead, and give permission to skip the detachment step. Either way, the tenant is unavailable for an extended period, and we have no means to improve it in the current architecture.
+
+Note that there is no 100% correct node failure detection mechanism, and even techniques to accelerate failure detection, such as *shoot-the-other-node-in-the-head*, have their limits. So, we currently rely on humans as node failure detectors: they get alerted via PagerDuty, assess the situation under high stress, and make the decision. If they make the wrong call, or the apparently dead pageserver somehow resurrects later, we’ll have data loss.
+
+Also, by relying on humans, we’re [incurring needless unscalable toil](https://sre.google/sre-book/eliminating-toil/): as Neon grows, pageserver failures will become more and more frequent because our fleet grows. Each instance will need quick response time to minimize downtime for the affected tenants, which implies higher toil, higher resulting attrition, and/or higher personnel cost.
+
+Lastly, there are foreseeable needs by operation and product such as zero-downtime relocation and automatic failover/HA. For such features, the ability to have a tenant purposefully or accidentally attached to more than one pageserver will greatly reduce risk of data loss, and improve availability.
+
+# High-Level Idea
+
+The core idea is to evolve the per-Tenant S3 state to an MVCC-like scheme, allowing multiple pageservers to operate on the same tenant S3 state without interference. To make changes to S3, pageservers acquire long-running transactions from Control Plane. After opening a transaction, Pageservers make PUTs directly against S3, but the keys include the transaction ID, so overwrites never happen. Periodically, pageservers talk back to Control Plane to commit their transaction. This is where Control Plane enforces strict linearizability, favoring availability over work-conservation: commit is only granted if no transaction started after the one that’s requesting commit. Garbage collection is done through deadlists, and it’s simplified tremendously by the above commit grant/reject policy.
+
+Minimal changes are required for safekeepers to allow WAL for a single timeline be consumed by more than one pageserver without premature truncation.
+
+**Above scheme makes it safe to attach tenants without a 100% correct node failure detection mechanism. Further, it makes it safe to interleave tenant-attachment to pageservers, unlocking new capabilities for (internal) product features:**
+
+- **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**: if a pageserver is not reachable (network partition, hardware failure, overload) we want to spread its attached tenants to new pageservers to restore availability, within the range of *seconds*. We cannot afford gracious timeouts to maximize the probability that the unreachable pageserver has ceased writing to S3. This proposal enables us to attach the tenants to the replacement pageservers,  and redirect their computes, without having to wait for confirmation that the unreachable pageserver has ceased writing to S3.
+- **Zero-Downtime Relocation:** we want to be able to relocate tenants to different pageservers with minimal availability or latency impact. This proposal enables us to attach the relocating Tenant to the destination Pageserver before detaching it from the source Pageserver. This can help minimize downtime because we can wait for the destination to catch up on WAL processing before redirecting Computes.
+
+# Design
+
+The core idea is to evolve the per-Tenant S3 state to a per-tenant MVCC-like scheme.
+
+To make S3 changes for a given tenant, Pageserver requests a transaction ID from control plane for that tenant. Without a transaction ID, Pageserver does not write to S3.
+
+Once Pageserver received a transaction ID it is allowed to produce new objects and overwrite objects created in this transaction. Pageserver is not allowed to delete any objects; instead, it marks the object as deleted by appending the key to the transaction’s deadlist for later deletion. Commits of transactions are serialized through Control Plane: when Pageserver wants to commit a transaction, it sends an RPC to Control Plane. Control Plane responds with a commit grant or commit reject message. Commit grant means that the transaction’s changes are now visible to subsequent transactions. Commit reject means that the transaction’s changes are not and never will be visible to another Pageserver instance, and the rejected Pageserver is to cease further activity on that tenant.
+
+## Commit grant/reject policy
+
+For the purposes of Pageserver, we want **linearizability** of a tenant’s S3 state. Since our transactions are scoped per tenant, it is sufficient for linearizability to grant commit if and only if no other transaction has been started since the commit-requesting transaction started.
+
+For example, consider the case of a single tenant, attached to Pageserver A. Pageserver A has an open transaction but becomes unresponsive. Control Plane decides to relocate the tenant to another Pageserver B. It need *not* wait for A to be 100%-certainly down before B can start uploading to S3 for that tenant. Instead, B can start a new transaction right away, make progress, and get commit grants. What about A? Its transaction is RejectPending in Control Plane until A eventually becomes responsive again, tries to commit, gets a rejection, acknowledges it, and thus its transaction becomes RejectAcknowledged. If A is definitively dead, an operator can also force-transition from state RejectPending to RejectAcknowledged. But critically, Control Plane doesn’t have to wait for A’s transaction to become RejectAcknowledged before attaching the tenant to B.
+
+```mermaid
+sequenceDiagram
+
+   participant CP
+   participant A
+   participant S3
+   participant B
+
+	 CP -->> A: attach tenant
+   activate A
+	 A -->> CP: start txn
+	 CP -->> A: txn=23, last_committed_txn=22
+
+
+	 Note over CP,A: network partition
+	 CP --x A: heartbeat
+	 CP --x A: heartbeat
+
+	 Note over CP: relocate tenant to avoid downtime
+	 CP -->> B: attach tenant
+   activate B
+	 B -->> CP: start txn
+   Note over CP: mark A's txn 23 as RejectPending
+	 CP -->> B: txn=24, last-committed txn is 22
+	 B -->> S3: PUT X.layer.24<br>PUT index_part.json.24 referencing X.layer.24
+	 B -->> CP: request commit
+	 CP -->> B: granted
+   B -->> CP: start txn
+  CP -->> B: txn=25, last_committed_txn=24
+
+   A -->> S3: PUT Y.layer.23 <br> PUT index_part.json.23 referencing Y.layer.23
+  A --x CP: request commit
+	 A --x CP: request commit
+
+   Note over CP,A: partition is over
+
+   A -->> CP: request commit
+
+   Note over CP: most recently started txn is 25, not 23, reject
+
+   CP -->> A: reject
+   A -->> CP: acknowledge reject
+
+   Note over CP: mark A's txn 23 as RejectAcknowledged
+
+  deactivate A
+
+  B -->> S3: PUT 000-FFF_X-Y.layer.25<br>...
+
+  deactivate B
+
+
+```
+
+If a Pageserver gets a rejection to a commit request, it acknowledges rejection and ceases further S3 uploads for the tenant, until it receives a `/detach` request for the tenant (control plane has most likely attached the tenant to another pageserver in the meantime).
+
+In practice, Control Plane will probably extend the commit grant/reject schema above, taking into account the pageserver to which it last attached the tenant. In the above example, Control Plane could remember that the pageserver that is supposed to host the tenant is pageserver B, and reject start-txn and commit requests from pageserver A. It would also use such requests from A as a signal that A is reachable again, and retry the `/detach`.
+
+<aside>
+💡 A commit failure causes the tenant to become effectively `Broken`. Pageserver should persist this locally so it doesn’t bother ControlPlane for a new txn when Pageserver is restarted.
+
+</aside>
+
+## Visibility
+
+We mentioned earlier that once a transaction commits, its changes are visible to subsequent transactions. But how does a given transaction know where to look for the data? There is no longer a single `index_part.json` per timeline, or a single `timelines/:timeline_id` prefix to look for; they’re all multi-versioned, suffixed by the txn number.
+The solution is: at transaction start, Pageserver receives the last-committed transaction ID from Control Plane (`last_committed_txn` in the diagram). `last_committed_txn` is the upper bound for what is visible for the current transaction. Control Plane keeps track of each open transaction’s `last_committed_txn` for purposes of garbage collection (see later paragraph).
+Equipped with last_committed_txn, Pageserver then discovers
+
+- the current index part of a timeline at `tenants/:tenant_id/timelines/:timeline_id/index_part.json.$last_committed_txn`. The `index_part.json.$last_committed_txn` has the exact same contents as the current architecture’s index_part.json, i.e. full list of layers.
+- the list of existent timelines as part of the `attach` RPC from CP;
+
+There is no other S3 state per tenant, so, that’s all the visibility required.
+An alternative to receiving the list of existent timelines from CP is to introduce a proper **SetOfTimelines** object in S3, and multi-version it just like above. For example, we could have a `tenants/:tenant_id/timelines.json.$txn` file that references `index_part.json.$last_committed_txn`. It can be added later if more separation between CP and PS is desired.
+
+So, the only MVCC’ed object types in this proposal are LayerFile and IndexPart (=individual timeline), but not the SetOfTimelines in a given tenant. Is this a problem? For example, the Pageserver’s garbage collection code needs to know the full set of timelines of a tenant. Otherwise it’ll make incorrect decisions. What if Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T. Not a problem with this proposal, because the effect of GC (i.e., layer deletion) is properly MVCC’ed.
+
+## Longevity Of Transactions & Availability
+
+Pageserver depends on Control Plane to start a new transaction. If ControlPlane is down, no new transactions can be started.
+
+Pageservers commit transactions based on a maximum amount of uncommitted changes that have accumulated in S3. A lower maximum increases dependence and load on ControlPlane which decreases availability. A higher maximum risks losing more work in the event of failover; the work will have to be re-done in a new transaction on the new node.
+
+Pageservers persist the open txn id in local storage, so that they can resume the transaction after restart, without dependence on Control Plane.
+
+## **Operations**
+
+**PUTs:**
+
+- **layer files**
+    - current architecture: layer files are supposed to be write-once, but actually, there are edge-cases where we PUT the same layer file name twice; namely if we PUT the file to S3 but crash before uploading the index part that references it; then detach + attach, and re-run compaction, which is non-deterministic.
+    - this proposal: with transactions, we can now upload layers and index_part.json concurrently, just need to make sure layer file upload is done before we request txn commit.
+- **index part** upload: `index_part.json.$txn` may be created and subsequently overwritten multiple times in a transaction; it is an availability/work-loss trade-off how often to request a commit from CP.
+
+**DELETEs**: for deletion, we maintain a deadlist per transaction. It is located at `tenants/:tenant_id/deadlist/deadlist.json.$txn`. It is PUT once before the pageserver requests commit, and not changed after sending the request to commit. An object created in the current txn need not (but can) be on the deadlist — it can be DELETEd immediately because it’s not visible to other transactions. An example use case would be an L0 layer that gets compacted within one transaction; or, if we ever start MVCC’ing the set of timelines of a tenant, a short-lived branch that is created & destroyed within one transaction.
+
+<aside>
+☝ **Deadlist Invariant:** if an object is on a deadlist of transaction T, it is not referenced from anywhere else in the full state visible to T or any later started transaction > T.
+
+</aside>
+
+### Rationale For Deadlist.json
+
+Given that this proposal only MVCC’s layers and indexparts, one may ask why the deadlist isn’t part of indexpart. The reason is to not lose generality: the deadlist is just a list of keys; it is not necessary to understand the data format of the versioned object to process the deadlist. This is important for garbage collection / vacuuming, which we’ll come to in the next section.
+
+## Garbage Collection / Vacuuming
+
+After a transaction has reached reject-acknowledged state,  Control Plane initiates a garbage collection procedure for the aborted transaction.
+
+Control Plane is in the unique position of knowing all transaction states. Here is a sketch of the exact transaction states and what Control Plane keeps track of.
+
+```
+struct Tenant {
+  ...
+
+  txns: HashMap<TxnId, Transaction>,
+  // the most recently started txn's id; only the most recently started can win
+  next_winner_txn: Option<TxnId>,
+}
+struct Transaction {
+  id: TxnId, // immutable
+  last_committed_txn: TxnId, // immutable; the most recent txn in state `Committed`
+                             // when self was started
+  pageserver_id: PageserverId,
+  state: enum {
+    Open,
+    Committed,
+    RejectPending,
+    RejectAcknowledged, // invariant: we know all S3 activity has ceased
+    GarbageCollected,
+  }
+}
+```
+
+Object creations & deletions by a rejected transaction have never been visible to other transactions. That is true for both RejectPending and RejectAcknowledged states. The difference is that, in RejectPending, the pageserver may still be uploading to S3, whereas in RejectAcknowledged, Control Plane can be certain that all S3 activity in the name of that transaction has ceased. So, once a transaction reaches the RejectAcknowledged state, it is safe to DELETE all objects created by that transaction, and discard the transaction’s deadlists.
+
+A transaction T in state Committed has subsequent transactions that may or may not reference the objects it created. None of the subsequent transactions can reference the objects on T’s deadlist, though, as per the Deadlist Invariant (see previous section).
+
+So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged:
+
+- Committed: delete objects on the deadlist.
+    - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap.
+    - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged are handled below.
+- RejectAcknowledged: delete all objects created in that txn, and discard deadlists.
+    - 404s / object-already-deleted type messages must be expected because of Committed garbage collection (see above)
+    - How to get this list of objects created in a txn? Open but solvable design question; Ideas:
+        - **Brute force**: within tenant prefix, search for all keys ending in `.$txn` and delete them.
+        - **WAL for PUTs**: before a txn PUTs an object, it logs to S3, or some other equivalently durable storage, that it’s going to do it. If we log to S3, this means we have to do an additional WAL PUT per “real” PUT.
+        - **LIST with reorg’ed S3 layout (preferred one right now):** lay out the S3 key space such that `$txn` comes first, i.e., `tenants/:tenant_id/$txn/timelines/:timeline_id/*.json.$txn`. That way, when we need to GC a RejectAcknowledged txn, we just LIST the entire `tenants/:tenant_id/$txn` prefix and delete it. The cost of GC for RejectAcknowledged transactions is thus proportional to the number of objects created in that transaction.
+
+## Branches
+
+This proposal only MVCC’s layer files and index_part.json, but leaves the tenant object not-MVCCed. We argued earlier that it’s fine to ignore this for now, because
+
+1. Control Plane can act as source-of-truth for the set of timelines, and
+2. The only operation that makes decisions based on “set of timelines” is GC, which in turn only does layer deletions, and layer deletions ***are*** properly MVCC’ed.
+
+Now that we’ve introduced garbage collection, let’s elaborate a little more on (2). Recall our example from earlier: Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T.
+
+How does the MVCC’ing of layer files protect us here? If A decides to delete that layer, it’s just on A’s transaction’s deadlist, but still present in S3 and usable by B. If A commits first, B won’t be able to commit and the layers in timeline T will be vacuumed. If B commits first, A’s deadlist is discarded and the layer continues to exist.
+
+## Safekeeper Changes
+
+We need to teach the safekeepers that there can be multiple pageservers requesting WAL for the same timeline, in order to prevent premature WAL truncation.
+
+In the current architecture, the Safekeeper service assumes only one Pageserver and is allowed to prune WAL older than that Pageserver’s `remote_consistent_lsn`. Safekeeper currently learns the `remote_consistent_lsn` through the walreceiver protocol.
+
+So, if we have a tenant attached to two pageservers at the same time, they will both try to stream WAL and the Safekeeper will get confused about which connection’s `remote_consistent_lsn` to use as a basis for WAL pruning.
+
+What do we need to change to make it work? We need to make sure that the Safekeepers only prune WAL up to the `remote_consistent_lsn` of the last-committed transaction.
+
+The straight-forward way to get it is to re-design WAL pruning as follows:
+
+1. Pageserver reports remote_consistent_lsn as part of transaction commit to Control Plane.
+2. Control Plane makes sure transaction state update is persisted.
+3. Control Plane (asynchronous to transaction commit) reconciles with Safekeepers to ensure WAL pruning happens.
+
+The above requires non-trivial changes, but, in the light of other planned projects such as restore-tenant-from-safekeeper-wal-backups, I think Control Plane will need to get involved in WAL pruning anyways.
+
+# How This Proposal Unlocks Future Features
+
+Let us revisit the example from the introduction where we were thinking about handling network partitions. Network partitions need to be solved first, because they’re unavoidable in distributed systems. We did that. Now let’s see how we can solve actual product problems:
+
+## **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**
+
+The “Problem Statement” section outlined the current architecture’s problems with regard to network partitions or instance failure: it requires a 100% correct node-dead detector to make decisions, which doesn’t exist in reality. We rely instead on human toil: an oncall engineer has to inspect the situation and make a decision, which may be incorrect and in any case takes time on the order of minutes, which means equivalent downtime for users.
+
+With this proposal, automatic failover for pageservers is trivial:
+
+If a pageserver is unresponsive from Control Plane’s / Compute’s perspective, Control Plane does the following:
+
+- attach all tenants of the unresponsive pageserver to new pageservers
+- switch over these tenants’ computes immediately;
+
+At this point, availability is restored and user pain relieved.
+
+What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
+
+1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above.
+2. Make a human operator investigate and decide what to do (next morning, NO ONCALL ALERT):
+    1. Inspect the instance, investigate logs, understand root cause.
+    2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC.
+    3. Use the procedure below to decommission the pageserver.
+
+### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive)
+
+The solution, enabled by this proposal:
+
+1. Ensure that pageserver’s S3 credentials are revoked so that it cannot make new uploads, which wouldn’t be tracked anywhere.
+2. Let enough time pass for the S3 credential revocation to propagate. Amazon doesn’t give a guarantee here. As stated earlier, we can easily afford to wait here.
+3. Mark all Open and RejectPending transactions of that pageserver as RejectAcknowledged.
+
+Revocation of the S3 credentials is required so that, once we transition all the transactions of that pageserver to RejectAcknowledged, one garbage-collection pass is guaranteed to delete all objects that will ever exist for that pageserver. That way, we need not check *GarbageCollected* transactions ever again.
+
+## Workflow: Zero-Downtime Relocation
+
+With zero-downtime relocation, the goal is to have the target pageserver warmed up, i.e., at the same `last_record_lsn` as the source pageserver, before switching over Computes from source to target pageserver.
+
+With this proposal, it works like so:
+
+1. Grant source pageserver its last open transaction. This one is doomed to be rejected later, unless the relocation fails.
+2. Grant target pageserver its first open transaction.
+3. Have target pageserver catch up on WAL, streaming from last-committed-txn’s remote_consistent_lsn onwards.
+4. Once target pageserver reports `last_record_lsn` close enough to source pageserver, target pageserver requests commit.
+5. Drain compute traffic from source to target pageserver. (Source can still answer requests until it tries to commit and gets rejected, so, this will be quite smooth).
+
+Note that as soon as we complete step (4), the source pageserver’s transaction is doomed to be rejected later. Conversely, if the target can’t catch up fast enough, the source will make a transaction commit earlier. This will generally happen if there is a lot of write traffic coming in. The design space to make things smooth here is large, but well explored in other areas of computing, e.g., VM live migration. We have all the important policy levers at hand, e.g.,
+
+- delaying source commits if we see target making progress
+- slowing down source consumption (need some signalling mechanism for it)
+- slowing down compute wal generation
+- …
+
+It doesn’t really matter, what’s important is that two pageservers can overlap.
+
+# Additional Trade-Offs / Remarks Brought Up During Peer Review
+
+This proposal was read by and discussed with @Stas and @Dmitry Rodionov prior to publishing it with the broader team. (This does not mean they endorse this proposal!).
+
+Issues that we discussed:
+
+1. **Frequency of transactions:** If even idle tenants commit every 10min or so, that’s quite a lot of load on Control Plane. Can we minimize it by Equating Transaction Commit Period to Attachment Period? I.e. start txn on attach, commit on detach?
+    1. Would be nice, but, if a tenant is attached for 1 month, then PS dies, we lose 1 month of work.
+    2. ⇒ my solution to this problem: Adjusted this proposal to make transaction commit frequency proportional to amount of uncommitted data.
+        1. It’s ok to spend resources on active users, they pay us money to do it!
+        2. The amount of work per transaction is minimal.
+            1. In current Control Plane, it’s a small database transaction that is super unlikely to conflict with other transactions.
+            2. I have very few concerns about scalability of the commit workload on CP side because it's trivially horizontally scalable by sharding by tenant.
+        3. There's no super stringent availability requirement on control plane; if a txn can't commit because it can't reach the CP, PS can continue & retry in the background, speculating that it's CP downtime and not PS-partitioned-off scenario.
+        4. Without stringent availability requirement, there's flexibility for future changes to CP-side-implementation.
+2. **Does this proposal address mirroring / no-performance-degradation failover?**
+    1. No it doesn’t. It only provides the building block for attaching a tenant to a new pageserver without having to worry that the tenant is detached on the old pageserver.
+    2. A simple scheme to build no-performance-degradation failover on top of this proposal is to have an asynchronous read-only replica of a tenant on another pageserver in the same region.
+    3. Another more ambitious scheme to get no-performance-degradation would be [One-Pager: Layer File Spreading (Christian)](https://www.notion.so/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=21); this proposal would be used in layer file spreading for risk-free automation of TenantLeader failover, which hasn’t been addressed there.
+    4. Either way, failover would restart from an older S3 state, and need to re-ingest WAL before being able to serve recently written pages.
+        1. Is that a show-stopper? I think not.
+        2. Is it suboptimal? Absolutely: if a pageserver instance fails, all its tenants will be distributed among the remaining pageservers (OK), and all these tenants will ask the safekeepers for WAL at the same time (BAD). So, pageserver instance failure will cause a load spike in safekeepers.
+            1. Personally I think that’s an OK trade-off to make.
+            2. There are countless options to avoid / mitigate the load spike. E.g., pro-actively streaming WAL to the standby read-only replica.
+
+3. **Does this proposal allow multiple writers for a tenant?**
+    1. In abstract terms, this proposal provides a linearized history for a given S3 prefix.
+    2. In concrete terms, this proposal provides a linearized history per tenant.
+    3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history.
+4. **Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:**
+    1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT
+    2. @Dmitry Rodionov :
+    3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment.
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -10,6 +10,9 @@ chrono.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
+regex.workspace = true

 utils = { path = "../utils" }
+remote_storage = { version = "0.1", path = "../remote_storage/" }
+
 workspace_hack.workspace = true
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -68,14 +68,45 @@ where
 /// Response of the /metrics.json API
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
+    /// Time spent waiting in pool
    pub wait_for_spec_ms: u64,
-    pub sync_safekeepers_ms: u64,
+
+    /// Time spent checking if safekeepers are synced
    pub sync_sk_check_ms: u64,
+
+    /// Time spent syncing safekeepers (walproposer.c).
+    /// In most cases this should be zero.
+    pub sync_safekeepers_ms: u64,
+
+    /// Time it took to establish a pg connection to the pageserver.
+    /// This is two roundtrips, so it's a good proxy for compute-pageserver
+    /// latency. The latency is usually 0.2ms, but it's not safe to assume
+    /// that.
+    pub pageserver_connect_micros: u64,
+
+    /// Time to get basebackup from pageserver and write it to disk.
    pub basebackup_ms: u64,
+
+    /// Compressed size of basebackup received.
    pub basebackup_bytes: u64,
+
+    /// Time spent starting postgres. This includes initialization of shared
+    /// buffers, preloading extensions, and other pg operations.
    pub start_postgres_ms: u64,
+
+    /// Time spent applying pg catalog updates that were made in the console
+    /// UI. This should be 0 when startup time matters, since cplane tries
+    /// to do these updates eagerly, and passes the skip_pg_catalog_updates
+    /// when it's safe to skip this step.
    pub config_ms: u64,
+
+    /// Total time, from when we receive the spec to when we're ready to take
+    /// pg connections.
    pub total_startup_ms: u64,
+    pub load_ext_ms: u64,
+    pub num_ext_downloaded: u64,
+    pub largest_ext_size: u64, // these are measured in bytes
+    pub total_ext_download_size: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -3,11 +3,16 @@
 //! The spec.json file is used to pass information to 'compute_ctl'. It contains
 //! all the information needed to start up the right version of PostgreSQL,
 //! and connect it to the storage nodes.
+use std::collections::HashMap;
+
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

+use regex::Regex;
+use remote_storage::RemotePath;
+
 /// String type alias representing Postgres identifier and
 /// intended to be used for DB / role names.
 pub type PgIdent = String;
@@ -60,6 +65,56 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
+
+    // information about available remote extensions
+    pub remote_extensions: Option<RemoteExtSpec>,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub struct RemoteExtSpec {
+    pub public_extensions: Option<Vec<String>>,
+    pub custom_extensions: Option<Vec<String>>,
+    pub library_index: HashMap<String, String>,
+    pub extension_data: HashMap<String, ExtensionData>,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ExtensionData {
+    pub control_data: HashMap<String, String>,
+    pub archive_path: String,
+}
+
+impl RemoteExtSpec {
+    /// Resolves `ext_name` to an `extension_data` entry and returns the resolved
+    /// extension name together with the `RemotePath` of its archive.
+    /// If `is_library` is set, the `.so…` suffix is stripped and the remainder is
+    /// mapped through `library_index` first. Fails if either lookup misses or
+    /// `RemotePath::from_string` rejects the stored `archive_path`.
+    pub fn get_ext(
+        &self,
+        ext_name: &str,
+        is_library: bool,
+    ) -> anyhow::Result<(String, RemotePath)> {
+        let mut real_ext_name = ext_name;
+        if is_library {
+            // Library names may carry a suffix such as `library.so` or
+            // `library.so.3`; `library_index` is keyed by the name without it.
+            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
+
+            real_ext_name = self
+                .library_index
+                .get(&lib_raw_name)
+                .ok_or_else(|| anyhow::anyhow!("library {} is not found", lib_raw_name))?;
+        }
+
+        let ext_data = self
+            .extension_data
+            .get(real_ext_name)
+            .ok_or_else(|| anyhow::anyhow!("real_ext_name {} is not found", real_ext_name))?;
+        let archive_path = RemotePath::from_string(&ext_data.archive_path)?;
+        Ok((real_ext_name.to_string(), archive_path))
+    }
 }

 #[serde_as]
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -205,5 +205,43 @@
            "name": "zenith new",
            "new_name": "zenith \"new\""
        }
-    ]
+    ],
+    "remote_extensions": {
+        "library_index": {
+          "anon": "anon",
+          "postgis-3": "postgis",
+          "libpgrouting-3.4": "postgis",
+          "postgis_raster-3": "postgis",
+          "postgis_sfcgal-3": "postgis",
+          "postgis_topology-3": "postgis",
+          "address_standardizer-3": "postgis"
+        },
+        "extension_data": {
+          "anon": {
+            "archive_path": "5834329303/v15/extensions/anon.tar.zst",
+            "control_data": {
+              "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
+            }
+          },
+          "postgis": {
+            "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
+            "control_data": {
+              "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
+              "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
+              "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
+              "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
+              "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
+              "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
+              "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
+              "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
+            }
+          }
+        },
+        "custom_extensions": [
+          "anon"
+        ],
+        "public_extensions": [
+          "postgis"
+        ]
+      }
 }
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,6 +17,32 @@ pub enum EventType {
    },
 }

+impl EventType {
+    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
+        use EventType::*;
+        match self {
+            Absolute { time } => Some(time),
+            _ => None,
+        }
+    }
+
+    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
+        // these can most likely be thought of as Range or RangeFull
+        use EventType::*;
+        match self {
+            Incremental {
+                start_time,
+                stop_time,
+            } => Some(start_time..stop_time),
+            _ => None,
+        }
+    }
+
+    pub fn is_incremental(&self) -> bool {
+        matches!(self, EventType::Incremental { .. })
+    }
+}
+
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
@@ -31,7 +57,7 @@ pub struct Event<Extra> {
    pub extra: Extra,
 }

-pub fn idempotency_key(node_id: String) -> String {
+pub fn idempotency_key(node_id: &str) -> String {
    format!(
        "{}-{}-{:04}",
        Utc::now(),
@@ -45,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000;
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
 #[derive(serde::Serialize)]
-pub struct EventChunk<'a, T> {
-    pub events: &'a [T],
+pub struct EventChunk<'a, T: Clone> {
+    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -20,6 +20,7 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
 tokio-util.workspace = true
 toml_edit.workspace = true
 tracing.workspace = true
+scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -65,6 +65,10 @@ impl RemotePath {
        Ok(Self(relative_path.to_path_buf()))
    }

+    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
+        Self::new(Path::new(relative_path))
+    }
+
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }
@@ -190,6 +194,20 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
+    // lists common *prefixes*, if any of files
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -201,14 +219,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -10,6 +10,7 @@ use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
@@ -22,6 +23,7 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
+use scopeguard::ScopeGuard;
 use tokio::{
    io::{self, AsyncRead},
    sync::Semaphore,
@@ -36,82 +38,9 @@ use crate::{

 const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;

-pub(super) mod metrics {
-    use metrics::{register_int_counter_vec, IntCounterVec};
-    use once_cell::sync::Lazy;
+pub(super) mod metrics;

-    static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "remote_storage_s3_requests_count",
-            "Number of s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric")
-    });
-
-    static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "remote_storage_s3_failures_count",
-            "Number of failed s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric")
-    });
-
-    pub fn inc_get_object() {
-        S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
-    }
-
-    pub fn inc_get_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["get_object"])
-            .inc();
-    }
-
-    pub fn inc_put_object() {
-        S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
-    }
-
-    pub fn inc_put_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["put_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_object() {
-        S3_REQUESTS_COUNT
-            .with_label_values(&["delete_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_objects(count: u64) {
-        S3_REQUESTS_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
-    pub fn inc_delete_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["delete_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_objects_fail(count: u64) {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
-    pub fn inc_list_objects() {
-        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
-    }
-
-    pub fn inc_list_objects_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["list_objects"])
-            .inc();
-    }
-}
+use self::metrics::{AttemptOutcome, RequestKind};

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -139,18 +68,29 @@ impl S3Bucket {
            aws_config.bucket_name
        );

+        let region = Some(Region::new(aws_config.bucket_region.clone()));
+
        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else("token", {
+                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
+
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build()
+            })
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

        let mut config_builder = Config::builder()
-            .region(Region::new(aws_config.bucket_region.clone()))
+            .region(region)
            .credentials_cache(CredentialsCache::lazy())
            .credentials_provider(credentials_provider);

@@ -200,25 +140,56 @@ impl S3Bucket {
        )
    }

-    fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
-        let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
-        for segment in path.0.iter() {
-            full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-            full_path.push_str(segment.to_str().unwrap_or_default());
+    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
+        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path
+            .get_path()
+            .to_string_lossy()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
+            .to_string();
+        match &self.prefix_in_bucket {
+            Some(prefix) => prefix.clone() + "/" + &path_string,
+            None => path_string,
        }
-        full_path
    }

-    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+        let started_at = start_counting_cancelled_wait(kind);
+        let permit = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .expect("semaphore is never closed");
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
+
+        permit
+    }
+
+    async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
+        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
            .clone()
            .acquire_owned()
            .await
-            .context("Concurrency limiter semaphore got closed during S3 download")
-            .map_err(DownloadError::Other)?;
+            .expect("semaphore is never closed");

-        metrics::inc_get_object();
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
+        permit
+    }
+
+    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+        let kind = RequestKind::Get;
+        let permit = self.owned_permit(kind).await;
+
+        let started_at = start_measuring_requests(kind);

        let get_object = self
            .client
@@ -229,26 +200,33 @@ impl S3Bucket {
            .send()
            .await;

+        let started_at = ScopeGuard::into_inner(started_at);
+
+        if get_object.is_err() {
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                kind,
+                AttemptOutcome::Err,
+                started_at,
+            );
+        }
+
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
-                        permit,
-                        object_output.body.into_async_read(),
+                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
+                        started_at,
+                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
                    ))),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                Err(DownloadError::NotFound)
            }
-            Err(e) => {
-                metrics::inc_get_object_fail();
-                Err(DownloadError::Other(anyhow::anyhow!(
-                    "Failed to download S3 object: {e}"
-                )))
-            }
+            Err(e) => Err(DownloadError::Other(
+                anyhow::Error::new(e).context("download s3 object"),
+            )),
        }
    }
 }
@@ -279,6 +257,54 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
    }
 }

+pin_project_lite::pin_project! {
+    /// Times and tracks the outcome of the request.
+    ///
+    /// Wraps a stream and, when dropped, reports the time elapsed since `started_at` to
+    /// `req_seconds` for `RequestKind::Get`, labelled with the value `outcome` holds at
+    /// that moment.
+    struct TimedDownload<S> {
+        // start instant supplied by whoever constructs the wrapper
+        started_at: std::time::Instant,
+        // last known outcome; this is what gets reported on drop
+        outcome: metrics::AttemptOutcome,
+        #[pin]
+        inner: S
+    }
+
+    // The observation is made in drop so that it happens once, however the stream ends.
+    impl<S> PinnedDrop for TimedDownload<S> {
+        fn drop(mut this: Pin<&mut Self>) {
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
+        }
+    }
+}
+
+impl<S: AsyncRead> TimedDownload<S> {
+    /// Wraps `inner`, measuring from `started_at`.
+    ///
+    /// The outcome starts out as `Cancelled`, so a wrapper that is dropped before a read
+    /// changes it is reported as cancelled.
+    fn new(started_at: std::time::Instant, inner: S) -> Self {
+        let outcome = metrics::AttemptOutcome::Cancelled;
+        Self {
+            started_at,
+            outcome,
+            inner,
+        }
+    }
+}
+
+impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
+    /// Forwards the read to `inner` and updates `outcome` from its result.
+    ///
+    /// - a ready read that adds no bytes although `buf` had room is end of stream: `Ok`
+    /// - a ready read that adds bytes leaves the outcome untouched
+    /// - an error sets the outcome to `Err`
+    ///
+    /// A pending read returns early and changes nothing.
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let this = self.project();
+        let before = buf.filled().len();
+        // A read into a buffer with no remaining capacity also leaves `filled` unchanged;
+        // remember whether there was room so that case is not mistaken for end of stream.
+        let had_capacity = buf.remaining() > 0;
+        let read = std::task::ready!(this.inner.poll_read(cx, buf));
+
+        let read_eof = had_capacity && buf.filled().len() == before;
+
+        match read {
+            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
+            Ok(()) => { /* still in progress */ }
+            Err(_) => *this.outcome = AttemptOutcome::Err,
+        }
+
+        std::task::Poll::Ready(read)
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    /// See the doc for `RemoteStorage::list_prefixes`
@@ -287,6 +313,8 @@ impl RemoteStorage for S3Bucket {
        &self,
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let kind = RequestKind::List;
+
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
@@ -303,15 +331,10 @@ impl RemoteStorage for S3Bucket {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
-        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list")
-                .map_err(DownloadError::Other)?;

-            metrics::inc_list_objects();
+        loop {
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);

            let fetch_response = self
                .client
@@ -323,12 +346,16 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
                .context("Failed to list S3 prefixes")
-                .map_err(DownloadError::Other)?;
+                .map_err(DownloadError::Other);
+
+            let started_at = ScopeGuard::into_inner(started_at);
+
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &fetch_response, started_at);
+
+            let fetch_response = fetch_response?;

            document_keys.extend(
                fetch_response
@@ -338,10 +365,10 @@ impl RemoteStorage for S3Bucket {
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
+            continuation_token = match fetch_response.next_continuation_token {
+                Some(new_token) => Some(new_token),
                None => break,
-            }
+            };
        }

        Ok(document_keys)
@@ -349,6 +376,8 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let kind = RequestKind::List;
+
        let folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());
@@ -357,12 +386,8 @@ impl RemoteStorage for S3Bucket {
        let mut continuation_token = None;
        let mut all_files = vec![];
        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
-            metrics::inc_list_objects();
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);

            let response = self
                .client
@@ -373,11 +398,14 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
-                .context("Failed to list files in S3 bucket")?;
+                .context("Failed to list files in S3 bucket");
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &response, started_at);
+
+            let response = response?;

            for object in response.contents().unwrap_or_default() {
                let object_path = object.key().expect("response does not contain a key");
@@ -399,18 +427,16 @@ impl RemoteStorage for S3Bucket {
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 upload")?;
+        let kind = RequestKind::Put;
+        let _guard = self.permit(kind).await;

-        metrics::inc_put_object();
+        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(ReaderStream::new(from));
        let bytes_stream = ByteStream::new(SdkBody::from(body));

-        self.client
+        let res = self
+            .client
            .put_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
@@ -418,19 +444,25 @@ impl RemoteStorage for S3Bucket {
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_put_object_fail();
-                e
-            })?;
+            .await;
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
+        res?;
+
        Ok(())
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        // if prefix is not none then download file `prefix/from`
+        // if prefix is none then download file `from`
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
            key: self.relative_path_to_s3_object(from),
-            ..GetObjectRequest::default()
+            range: None,
        })
        .await
    }
@@ -457,11 +489,8 @@ impl RemoteStorage for S3Bucket {
        .await
    }
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+        let kind = RequestKind::Delete;
+        let _guard = self.permit(kind).await;

        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
@@ -472,7 +501,7 @@ impl RemoteStorage for S3Bucket {
        }

        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
-            metrics::inc_delete_objects(chunk.len() as u64);
+            let started_at = start_measuring_requests(kind);

            let resp = self
                .client
@@ -482,10 +511,17 @@ impl RemoteStorage for S3Bucket {
                .send()
                .await;

+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
            match resp {
                Ok(resp) => {
+                    metrics::BUCKET_METRICS
+                        .deleted_objects_total
+                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        metrics::inc_delete_objects_fail(errors.len() as u64);
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -493,7 +529,6 @@ impl RemoteStorage for S3Bucket {
                    }
                }
                Err(e) => {
-                    metrics::inc_delete_objects_fail(chunk.len() as u64);
                    return Err(e.into());
                }
            }
@@ -502,24 +537,89 @@ impl RemoteStorage for S3Bucket {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
-
-        metrics::inc_delete_object();
-
-        self.client
-            .delete_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(path))
-            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_delete_object_fail();
-                e
-            })?;
-        Ok(())
+        // A single delete is a one-element batch: view `path` as a slice and reuse
+        // `delete_objects`, which takes care of the permit and the metrics.
+        let paths = std::array::from_ref(path);
+        self.delete_objects(paths).await
+    }
+}
+
+/// Starts timing a semaphore wait for `kind` and returns a guard holding the start instant.
+///
+/// Unless the caller takes the instant back out with `ScopeGuard::into_inner`, dropping the
+/// guard (cancellation) counts towards [`metrics::BucketMetrics::cancelled_waits`].
+fn start_counting_cancelled_wait(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    let wait_started = std::time::Instant::now();
+    let count_cancellation =
+        move |_: std::time::Instant| metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc();
+    scopeguard::guard_on_success(wait_started, count_cancellation)
+}
+
+/// Starts timing a request of `kind` and returns a guard holding the start instant.
+///
+/// Unless the caller takes the instant back out with `ScopeGuard::into_inner`, dropping the
+/// guard (cancellation) adds the elapsed time to [`metrics::BucketMetrics::req_seconds`]
+/// with the `Cancelled` outcome.
+fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    let request_started = std::time::Instant::now();
+    let record_cancelled = move |started_at: std::time::Instant| {
+        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    };
+    scopeguard::guard_on_success(request_started, record_cancelled)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::num::NonZeroUsize;
+    use std::path::Path;
+
+    use crate::{RemotePath, S3Bucket, S3Config};
+
+    /// Checks the S3 key produced by `relative_path_to_s3_object` for every combination
+    /// of the `prefix_in_bucket` values and remote paths listed below.
+    #[test]
+    fn relative_path() {
+        let all_paths = vec!["", "some/path", "some/path/"];
+        let all_paths: Vec<RemotePath> = all_paths
+            .iter()
+            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
+            .collect();
+        let prefixes = [
+            None,
+            Some(""),
+            Some("test/prefix"),
+            Some("test/prefix/"),
+            Some("/test/prefix/"),
+        ];
+        // expected_outputs[i][j] is the key expected for prefixes[i] combined with all_paths[j]
+        let expected_outputs = vec![
+            vec!["", "some/path", "some/path"],
+            vec!["/", "/some/path", "/some/path"],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+        ];
+
+        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
+            // only `prefix_in_bucket` varies between iterations
+            let config = S3Config {
+                bucket_name: "bucket".to_owned(),
+                bucket_region: "region".to_owned(),
+                prefix_in_bucket: prefix.map(str::to_string),
+                endpoint: None,
+                concurrency_limit: NonZeroUsize::new(100).unwrap(),
+                max_keys_per_list_response: Some(5),
+            };
+            let storage = S3Bucket::new(&config).expect("remote storage init");
+            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
+                let result = storage.relative_path_to_s3_object(test_path);
+                let expected = expected_outputs[prefix_idx][test_path_idx];
+                assert_eq!(result, expected);
+            }
+        }
    }
 }
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -0,0 +1,191 @@
+use metrics::{
+    register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
+};
+use once_cell::sync::Lazy;
+
+pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
+
+/// The kind of S3 request a metric sample belongs to.
+///
+/// The explicit discriminants are used as array indices, see `RequestKind::as_index`.
+#[derive(Clone, Copy, Debug)]
+pub(super) enum RequestKind {
+    Get = 0,
+    Put = 1,
+    Delete = 2,
+    List = 3,
+}
+
+use RequestKind::*;
+
+impl RequestKind {
+    /// Label value identifying this request kind in exported metrics.
+    const fn as_str(&self) -> &'static str {
+        match *self {
+            Self::Get => "get_object",
+            Self::Put => "put_object",
+            Self::Delete => "delete_object",
+            Self::List => "list_objects",
+        }
+    }
+
+    /// The enum discriminant as an array index.
+    const fn as_index(&self) -> usize {
+        *self as usize
+    }
+}
+
+/// One value of `C` per [`RequestKind`], stored at the index given by `RequestKind::as_index`.
+pub(super) struct RequestTyped<C>([C; 4]);
+
+impl<C> RequestTyped<C> {
+    /// Returns the value stored for `kind`.
+    pub(super) fn get(&self, kind: RequestKind) -> &C {
+        &self.0[kind.as_index()]
+    }
+
+    /// Builds the array by calling `f` once for each request kind.
+    ///
+    /// Panics if the kind list below does not line up with the array slots.
+    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
+        use RequestKind::*;
+        let mut it = [Get, Put, Delete, List].into_iter();
+        let arr = std::array::from_fn::<C, 4, _>(|index| {
+            let next = it.next().unwrap();
+            // each kind has to land in the slot `get` will later read it from
+            assert_eq!(index, next.as_index());
+            f(next)
+        });
+
+        // catches a kind list that is longer than the array
+        if let Some(next) = it.next() {
+            panic!("unexpected {next:?}");
+        }
+
+        RequestTyped(arr)
+    }
+}
+
+impl RequestTyped<Histogram> {
+    /// Observes the seconds elapsed since `started_at` in the histogram for `kind`.
+    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+        let histogram = self.get(kind);
+        histogram.observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+/// One [`RequestTyped`] table per [`AttemptOutcome`], i.e. a value of `C` for every
+/// combination of request kind and outcome.
+pub(super) struct PassFailCancelledRequestTyped<C> {
+    // values for `AttemptOutcome::Ok`
+    success: RequestTyped<C>,
+    // values for `AttemptOutcome::Err`
+    fail: RequestTyped<C>,
+    // values for `AttemptOutcome::Cancelled`
+    cancelled: RequestTyped<C>,
+}
+
+/// How a single request attempt ended; used as the `result` label of the request histogram.
+#[derive(Debug, Clone, Copy)]
+pub(super) enum AttemptOutcome {
+    Ok,
+    Err,
+    Cancelled,
+}
+
+impl<T, E> From<&Result<T, E>> for AttemptOutcome {
+    /// Maps `Ok(_)` to [`AttemptOutcome::Ok`] and `Err(_)` to [`AttemptOutcome::Err`].
+    fn from(value: &Result<T, E>) -> Self {
+        if value.is_ok() {
+            Self::Ok
+        } else {
+            Self::Err
+        }
+    }
+}
+
+impl AttemptOutcome {
+    /// Label value identifying this outcome in exported metrics.
+    pub(super) fn as_str(&self) -> &'static str {
+        match *self {
+            Self::Ok => "ok",
+            Self::Err => "err",
+            Self::Cancelled => "cancelled",
+        }
+    }
+}
+
+impl<C> PassFailCancelledRequestTyped<C> {
+    /// Returns the value stored for the given `kind` and `outcome`.
+    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+        match outcome {
+            AttemptOutcome::Ok => self.success.get(kind),
+            AttemptOutcome::Err => self.fail.get(kind),
+            AttemptOutcome::Cancelled => self.cancelled.get(kind),
+        }
+    }
+
+    /// Builds the three per-outcome tables, calling `f` for every kind and outcome pair.
+    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
+        let mut table_for =
+            |outcome: AttemptOutcome| RequestTyped::build_with(|kind| f(kind, outcome));
+
+        // fields are initialized in the order written: Ok, Err, Cancelled
+        Self {
+            success: table_for(AttemptOutcome::Ok),
+            fail: table_for(AttemptOutcome::Err),
+            cancelled: table_for(AttemptOutcome::Cancelled),
+        }
+    }
+}
+
+impl PassFailCancelledRequestTyped<Histogram> {
+    /// Observes the seconds elapsed since `started_at` in the histogram selected by `kind`
+    /// and `outcome`.
+    ///
+    /// `outcome` may be an [`AttemptOutcome`] or anything convertible into one, such as a
+    /// reference to a `Result`.
+    pub(super) fn observe_elapsed(
+        &self,
+        kind: RequestKind,
+        outcome: impl Into<AttemptOutcome>,
+        started_at: std::time::Instant,
+    ) {
+        let outcome: AttemptOutcome = outcome.into();
+        let histogram = self.get(kind, outcome);
+        histogram.observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+/// The metrics kept for the S3 bucket client; a single shared instance lives in
+/// `BUCKET_METRICS`.
+pub(super) struct BucketMetrics {
+    /// Full request duration until successful completion, error or cancellation.
+    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Total amount of seconds waited on queue.
+    pub(super) wait_seconds: RequestTyped<Histogram>,
+
+    /// Track how many semaphore awaits were cancelled per request type.
+    ///
+    /// This is in case cancellations are happening more than expected.
+    pub(super) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total amount of deleted objects in batches or single requests.
+    pub(super) deleted_objects_total: IntCounter,
+}
+
+impl Default for BucketMetrics {
+    /// Registers all metrics and resolves every label combination up front.
+    ///
+    /// Panics if any of the registrations fails.
+    fn default() -> Self {
+        // bucket boundaries shared by both histograms below
+        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
+
+        let req_seconds = register_histogram_vec!(
+            "remote_storage_s3_request_seconds",
+            "Seconds to complete a request",
+            &["request_type", "result"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        // one histogram per (request kind, outcome) pair
+        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
+            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
+        });
+
+        let wait_seconds = register_histogram_vec!(
+            "remote_storage_s3_wait_seconds",
+            "Seconds rate limited",
+            &["request_type"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        // one histogram per request kind
+        let wait_seconds =
+            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
+
+        let cancelled_waits = register_int_counter_vec!(
+            "remote_storage_s3_cancelled_waits_total",
+            "Times a semaphore wait has been cancelled per request type",
+            &["request_type"],
+        )
+        .unwrap();
+        // one counter per request kind
+        let cancelled_waits =
+            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
+
+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
+        Self {
+            req_seconds,
+            wait_seconds,
+            cancelled_waits,
+            deleted_objects_total,
+        }
+    }
+}
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -71,6 +71,13 @@ impl UnreliableWrapper {
            }
        }
    }
+
+    /// Deletes `path` through the wrapped storage.
+    ///
+    /// When `attempt` is true the operation is first passed to `self.attempt`, and an
+    /// error from it is returned without calling the wrapped storage. Pass false when the
+    /// caller has already recorded the attempt itself.
+    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
+        if attempt {
+            self.attempt(RemoteOp::Delete(path.clone()))?;
+        }
+        self.inner.delete(path).await
+    }
 }

 #[async_trait::async_trait]
@@ -122,15 +129,15 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::Delete(path.clone()))?;
-        self.inner.delete(path).await
+        self.delete_inner(path, true).await
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
-            if (self.delete(path).await).is_err() {
+            // Dont record attempt because it was already recorded above
+            if (self.delete_inner(path, false).await).is_err() {
                error_counter += 1;
            }
        }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

-const BASE_PREFIX: &str = "test/";
+const BASE_PREFIX: &str = "test";

 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,6 +26,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -0,0 +1,234 @@
+use std::fmt::{Debug, Display};
+
+use futures::Future;
+use tokio_util::sync::CancellationToken;
+
+pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
+pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
+
+/// Sleeps for the backoff duration of retry number `n`, or until `cancel` is cancelled,
+/// whichever comes first.
+///
+/// Returns immediately, without logging, when the computed duration is not positive.
+pub async fn exponential_backoff(
+    n: u32,
+    base_increment: f64,
+    max_seconds: f64,
+    cancel: &CancellationToken,
+) {
+    let backoff_duration_seconds =
+        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    let should_wait = backoff_duration_seconds > 0.0;
+    if !should_wait {
+        return;
+    }
+
+    tracing::info!(
+        "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
+    );
+
+    let backoff = std::time::Duration::from_secs_f64(backoff_duration_seconds);
+    // Both results (timer elapsed, token cancelled) simply end the wait.
+    let _ = tokio::time::timeout(backoff, cancel.cancelled()).await;
+}
+
+/// Backoff in seconds for retry number `n`: zero for `n == 0`, otherwise
+/// `(1 + base_increment)^n` capped at `max_seconds`.
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+    match n {
+        0 => 0.0,
+        retries => {
+            let growth_factor = 1.0 + base_increment;
+            growth_factor.powf(f64::from(retries)).min(max_seconds)
+        }
+    }
+}
+
+/// Configure cancellation for a retried operation: when to cancel (the token), and
+/// what kind of error to return on cancellation
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    // checked by `retry` before every attempt and awaited during backoff sleeps
+    token: CancellationToken,
+    // builds the error `retry` returns once the token is cancelled
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    /// Pairs a cancellation `token` with the closure that produces the error to return
+    /// when that token is cancelled.
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
+/// Retries the passed operation until one of the following conditions is met:
+/// - the operation succeeds;
+/// - the encountered error is considered permanent (non-retryable);
+/// - retries have been exhausted;
+/// - the token in `cancel` is cancelled, in which case the error built by its `on_cancel`
+///   closure is returned.
+///
+/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors.
+/// When attempts cross `warn_threshold` the function starts to emit log warnings instead of info messages.
+/// `description` argument is added to log messages. Its value should identify what the `op` is doing.
+/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
+/// to drop out promptly on shutdown.
+///
+/// NOTE(review): a failure only becomes final once `attempts` has reached both
+/// `warn_threshold` and `max_retries`, so a `warn_threshold` larger than `max_retries`
+/// extends the number of retries. Confirm whether that is intended.
+pub async fn retry<T, O, F, E, CF>(
+    mut op: O,
+    is_permanent: impl Fn(&E) -> bool,
+    warn_threshold: u32,
+    max_retries: u32,
+    description: &str,
+    cancel: Cancel<E, CF>,
+) -> Result<T, E>
+where
+    // Not std::error::Error because anyhow::Error doesnt implement it.
+    // For context see https://github.com/dtolnay/anyhow/issues/63
+    E: Display + Debug + 'static,
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
+{
+    // number of failed attempts so far
+    let mut attempts = 0;
+    loop {
+        // Checked before every attempt; a backoff sleep cut short by cancellation ends up here.
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
+        }
+
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    tracing::info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(ref e) if is_permanent(e) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(err) if attempts < warn_threshold => {
+                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(err) if attempts < max_retries => {
+                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(ref err) => {
+                // Retries are exhausted (see `max_retries`). Time to give up.
+                tracing::warn!(
+                    "{description} still failed after {attempts} retries, giving up: {err:?}"
+                );
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel.token,
+        )
+        .await;
+        attempts += 1;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io;
+
+    use tokio::sync::Mutex;
+
+    use super::*;
+
+    /// With the default parameters the backoff never decreases as the retry number grows
+    /// and ends up at the configured maximum.
+    #[test]
+    fn backoff_defaults_produce_growing_backoff_sequence() {
+        let mut current_backoff_value = None;
+
+        for i in 0..10_000 {
+            let new_backoff_value = exponential_backoff_duration_seconds(
+                i,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+
+            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
+                assert!(
+                    old_backoff_value <= new_backoff_value,
+                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
+                )
+            }
+        }
+
+        assert_eq!(
+            current_backoff_value.expect("Should have produced backoff values to compare"),
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            "Given big enough of retries, backoff should reach its allowed max value"
+        );
+    }
+
+    /// An operation that always fails is called twice when one retry is allowed:
+    /// the initial call plus the retry.
+    #[tokio::test(start_paused = true)]
+    async fn retry_always_error() {
+        let count = Mutex::new(0);
+        let err_result = retry(
+            || async {
+                *count.lock().await += 1;
+                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
+            },
+            |_e| false,
+            1,
+            1,
+            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+        )
+        .await;
+
+        assert!(err_result.is_err());
+
+        assert_eq!(*count.lock().await, 2);
+    }
+
+    /// The operation fails on its first two calls and succeeds on the third, which two
+    /// allowed retries are enough to reach.
+    #[tokio::test(start_paused = true)]
+    async fn retry_ok_after_err() {
+        let count = Mutex::new(0);
+        retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| false,
+            2,
+            2,
+            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+        )
+        .await
+        .unwrap();
+    }
+
+    /// Every error is reported as permanent, so the operation must run exactly once even
+    /// though retries are allowed.
+    #[tokio::test(start_paused = true)]
+    async fn dont_retry_permanent_errors() {
+        let count = Mutex::new(0);
+        let _ = retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| true,
+            2,
+            2,
+            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
+        )
+        .await
+        .unwrap_err();
+
+        assert_eq!(*count.lock().await, 1);
+    }
+}
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

+/// Opens `path` and calls `sync_all` on the resulting handle, returning the first I/O
+/// error from either step.
+pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
+    let file = tokio::fs::File::open(path).await?;
+    file.sync_all().await
+}
+
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,6 +24,20 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

+/// Returns the file names of all entries directly inside the directory at `path`.
+///
+/// Names are converted to `String` lossily. No ordering is applied here, so callers
+/// that need a stable order have to sort the result themselves.
+///
+/// Fails if the directory cannot be opened (the error carries the path as context) or
+/// if reading an entry fails.
+pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
+    let path = path.as_ref();
+    // `with_context` builds the message only when `read_dir` actually failed
+    let mut dir = tokio::fs::read_dir(path)
+        .await
+        .with_context(|| format!("read_dir({})", path.display()))?;
+
+    let mut content = vec![];
+    while let Some(next) = dir.next_entry().await? {
+        content.push(next.file_name().to_string_lossy().into_owned());
+    }
+
+    Ok(content)
+}
+
 pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
@@ -43,7 +57,7 @@ where
 mod test {
    use std::path::PathBuf;

-    use crate::fs_ext::is_directory_empty;
+    use crate::fs_ext::{is_directory_empty, list_dir};

    use super::ignore_absent_files;

@@ -109,4 +123,25 @@ mod test {

        assert!(!file_path.exists());
    }
+
+    #[tokio::test]
+    async fn list_dir_works() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        assert!(list_dir(dir_path).await.unwrap().is_empty());
+
+        let file_path: PathBuf = dir_path.join("testfile");
+        let _ = std::fs::File::create(&file_path).unwrap();
+
+        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
+
+        let another_dir_path: PathBuf = dir_path.join("testdir");
+        std::fs::create_dir(another_dir_path).unwrap();
+
+        let expected = &["testdir", "testfile"];
+        let mut actual = list_dir(dir_path).await.unwrap();
+        actual.sort();
+        assert_eq!(actual, expected);
+    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,8 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

+pub mod backoff;
+
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
@@ -66,43 +68,7 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

-mod failpoint_macro_helpers {
-
-    /// use with fail::cfg("$name", "return(2000)")
-    ///
-    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
-    /// specified time (in milliseconds). The main difference is that we use async
-    /// tokio sleep function. Another difference is that we print lines to the log,
-    /// which can be useful in tests to check that the failpoint was hit.
-    #[macro_export]
-    macro_rules! failpoint_sleep_millis_async {
-        ($name:literal) => {{
-            // If the failpoint is used with a "return" action, set should_sleep to the
-            // returned value (as string). Otherwise it's set to None.
-            let should_sleep = (|| {
-                ::fail::fail_point!($name, |x| x);
-                ::std::option::Option::None
-            })();
-
-            // Sleep if the action was a returned value
-            if let ::std::option::Option::Some(duration_str) = should_sleep {
-                $crate::failpoint_sleep_helper($name, duration_str).await
-            }
-        }};
-    }
-
-    // Helper function used by the macro. (A function has nicer scoping so we
-    // don't need to decorate everything with "::")
-    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
-        let millis = duration_str.parse::<u64>().unwrap();
-        let d = std::time::Duration::from_millis(millis);
-
-        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-        tokio::time::sleep(d).await;
-        tracing::info!("failpoint {:?}: sleep done", name);
-    }
-}
-pub use failpoint_macro_helpers::failpoint_sleep_helper;
+pub mod sync;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -0,0 +1 @@
+pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -0,0 +1,306 @@
+use std::sync::{Arc, Mutex, MutexGuard};
+use tokio::sync::Semaphore;
+
+/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
+/// for the duration of initialization.
+///
+/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
+///
+/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
+pub struct OnceCell<T> {
+    inner: Mutex<Inner<T>>,
+}
+
+impl<T> Default for OnceCell<T> {
+    /// Create new uninitialized [`OnceCell`].
+    fn default() -> Self {
+        Self {
+            inner: Default::default(),
+        }
+    }
+}
+
+/// Semaphore is the current state:
+/// - open semaphore means the value is `None`, not yet initialized
+/// - closed semaphore means the value has been initialized
+#[derive(Debug)]
+struct Inner<T> {
+    init_semaphore: Arc<Semaphore>,
+    value: Option<T>,
+}
+
+impl<T> Default for Inner<T> {
+    fn default() -> Self {
+        Self {
+            init_semaphore: Arc::new(Semaphore::new(1)),
+            value: None,
+        }
+    }
+}
+
+impl<T> OnceCell<T> {
+    /// Creates an already initialized `OnceCell` with the given value.
+    pub fn new(value: T) -> Self {
+        let sem = Semaphore::new(1);
+        sem.close();
+        Self {
+            inner: Mutex::new(Inner {
+                init_semaphore: Arc::new(sem),
+                value: Some(value),
+            }),
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization, and
+    /// starts over if the value is taken again before a waiting caller gets to observe it.
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    where
+        F: FnOnce() -> Fut,
+        Fut: std::future::Future<Output = Result<T, E>>,
+    {
+        loop {
+            let sem = {
+                let guard = self.inner.lock().unwrap();
+                if guard.value.is_some() {
+                    return Ok(Guard(guard));
+                }
+                guard.init_semaphore.clone()
+            };
+
+            let permit = sem.clone().acquire_owned().await;
+            if permit.is_ok() {
+                let value = factory().await?;
+                let mut guard = self.inner.lock().unwrap();
+                assert!(
+                    guard.value.is_none(),
+                    "we won permit, must not be initialized"
+                );
+                guard.value = Some(value);
+                guard.init_semaphore.close();
+                return Ok(Guard(guard));
+            }
+            // closed while waiting: initialized, unless `take_and_deinit` ran since (then retry)
+            let guard = self.inner.lock().unwrap();
+            if guard.value.is_some() {
+                return Ok(Guard(guard));
+            }
+            let replaced = !Arc::ptr_eq(&sem, &guard.init_semaphore);
+            assert!(replaced, "semaphore got closed, must be initialized");
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
+        if guard.value.is_some() {
+            Some(Guard(guard))
+        } else {
+            None
+        }
+    }
+}
+
+/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
+/// initialized value.
+#[derive(Debug)]
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for Guard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> std::ops::DerefMut for Guard<'_, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.0
+            .value
+            .as_mut()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<'a, T> Guard<'a, T> {
+    /// Take the current value, and a new permit for its deinitialization.
+    ///
+    /// The permit will be on a semaphore part of the new internal value, and any following
+    /// [`OnceCell::get_or_init`] will wait on it to complete.
+    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
+        let mut swapped = Inner::default();
+        let permit = swapped
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .expect("we just created this");
+        std::mem::swap(&mut *self.0, &mut swapped);
+        swapped
+            .value
+            .map(|v| (v, permit))
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        convert::Infallible,
+        sync::atomic::{AtomicUsize, Ordering},
+        time::Duration,
+    };
+
+    #[tokio::test]
+    async fn many_initializers() {
+        #[derive(Default, Debug)]
+        struct Counters {
+            factory_got_to_run: AtomicUsize,
+            future_polled: AtomicUsize,
+            winners: AtomicUsize,
+        }
+
+        let initializers = 100;
+
+        let cell = Arc::new(OnceCell::default());
+        let counters = Arc::new(Counters::default());
+        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
+
+        let mut js = tokio::task::JoinSet::new();
+        for i in 0..initializers {
+            js.spawn({
+                let cell = cell.clone();
+                let counters = counters.clone();
+                let barrier = barrier.clone();
+
+                async move {
+                    barrier.wait().await;
+                    let won = {
+                        let g = cell
+                            .get_or_init(|| {
+                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
+                                async {
+                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
+                                    Ok::<_, Infallible>(i)
+                                }
+                            })
+                            .await
+                            .unwrap();
+
+                        *g == i
+                    };
+
+                    if won {
+                        counters.winners.fetch_add(1, Ordering::Relaxed);
+                    }
+                }
+            });
+        }
+
+        barrier.wait().await;
+
+        while let Some(next) = js.join_next().await {
+            next.expect("no panics expected");
+        }
+
+        let mut counters = Arc::try_unwrap(counters).unwrap();
+
+        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
+        assert_eq!(*counters.future_polled.get_mut(), 1);
+        assert_eq!(*counters.winners.get_mut(), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn reinit_waits_for_deinit() {
+        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
+        let sleep_for = Duration::from_secs(1);
+        let initial = 42;
+        let reinit = 1;
+        let cell = Arc::new(OnceCell::new(initial));
+
+        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
+
+        let jh = tokio::spawn({
+            let cell = cell.clone();
+            let deinitialization_started = deinitialization_started.clone();
+            async move {
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                assert_eq!(answer, initial);
+
+                deinitialization_started.wait().await;
+                tokio::time::sleep(sleep_for).await;
+            }
+        });
+
+        deinitialization_started.wait().await;
+
+        let started_at = tokio::time::Instant::now();
+        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
+            .await
+            .unwrap();
+
+        let elapsed = started_at.elapsed();
+        assert!(
+            elapsed >= sleep_for,
+            "initialization should had taken at least the time time slept with permit"
+        );
+
+        jh.await.unwrap();
+
+        assert_eq!(*cell.get().unwrap(), reinit);
+    }
+
+    #[tokio::test]
+    async fn initialization_attemptable_until_ok() {
+        let cell = OnceCell::default();
+
+        for _ in 0..10 {
+            cell.get_or_init(|| async { Err("whatever error") })
+                .await
+                .unwrap_err();
+        }
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "finally success");
+    }
+
+    #[tokio::test]
+    async fn initialization_is_cancellation_safe() {
+        let cell = OnceCell::default();
+
+        let barrier = tokio::sync::Barrier::new(2);
+
+        let initializer = cell.get_or_init(|| async {
+            barrier.wait().await;
+            futures::future::pending::<()>().await;
+
+            Ok::<_, Infallible>("never reached")
+        });
+
+        tokio::select! {
+            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
+            _ = barrier.wait() => {}
+        };
+
+        // now initializer is dropped
+
+        assert!(cell.get().is_none());
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "now initialized");
+    }
+}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "vm_monitor"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "vm-monitor"
+path = "./src/bin/monitor.rs"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+axum.workspace = true
+clap.workspace = true
+futures.workspace = true
+inotify.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+sysinfo.workspace = true
+tokio.workspace = true
+tokio-postgres.workspace = true
+tokio-stream.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "linux")'.dependencies]
+cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -0,0 +1,18 @@
+# `vm-monitor`
+
+The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
+along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
+two primary roles: 1) notifying agents when immediate upscaling is necessary due
+to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
+out upscaling and downscaling decisions.
+
+## More on scaling
+
+We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
+To control thresholds for receiving memory usage notifications, we start Postgres
+in the `neon-postgres` cgroup and set its `memory.{max,high}`.
+
+* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
+* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
+where initial development of the monitor happened. The repository is no longer
+maintained but the commit history may be useful for debugging.
--- a/libs/vm_monitor/src/bin/monitor.rs
+++ b/libs/vm_monitor/src/bin/monitor.rs
@@ -0,0 +1,33 @@
+// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
+// we can test the monitor as part of the entire autoscaling system in
+// neondatabase/autoscaling.
+//
+// The monitor was previously started by vm-builder, and for testing purposes,
+// we can mimic that setup with this binary.
+
+#[cfg(target_os = "linux")]
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    use clap::Parser;
+    use tokio_util::sync::CancellationToken;
+    use tracing_subscriber::EnvFilter;
+    use vm_monitor::Args;
+
+    let subscriber = tracing_subscriber::fmt::Subscriber::builder()
+        .json()
+        .with_file(true)
+        .with_line_number(true)
+        .with_span_list(true)
+        .with_env_filter(EnvFilter::from_default_env())
+        .finish();
+    tracing::subscriber::set_global_default(subscriber)?;
+
+    let args: &'static Args = Box::leak(Box::new(Args::parse()));
+    let token = CancellationToken::new();
+    vm_monitor::start(args, token).await
+}
+
+#[cfg(not(target_os = "linux"))]
+fn main() {
+    panic!("the monitor requires cgroups, which are only available on linux")
+}
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -0,0 +1,693 @@
+use std::{
+    fmt::{Debug, Display},
+    fs,
+    pin::pin,
+    sync::atomic::{AtomicU64, Ordering},
+};
+
+use anyhow::{anyhow, bail, Context};
+use cgroups_rs::{
+    freezer::FreezerController,
+    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    memory::MemController,
+    MaxValue,
+    Subsystem::{Freezer, Mem},
+};
+use inotify::{EventStream, Inotify, WatchMask};
+use tokio::sync::mpsc::{self, error::TryRecvError};
+use tokio::time::{Duration, Instant};
+use tokio_stream::{Stream, StreamExt};
+use tracing::{info, warn};
+
+use crate::protocol::Resources;
+use crate::MiB;
+
+/// Monotonically increasing counter of the number of memory.high events
+/// the cgroup has experienced.
+///
+/// We use this to determine if a modification to the `memory.events` file actually
+/// changed the `high` field. If not, we don't care about the change. When we
+/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
+/// to see if it changed since last time.
+pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Monotonically increasing counter that gives each cgroup event a unique id.
+///
+/// This allows us to answer questions like "did this upscale arrive before this
+/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
+/// with a sequence number. As such, prefer to use the `Sequenced` type rather
+/// than this static directly.
+static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
+
+/// A memory event type reported in memory.events.
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub enum MemoryEvent {
+    Low,
+    High,
+    Max,
+    Oom,
+    OomKill,
+    OomGroupKill,
+}
+
+impl MemoryEvent {
+    fn as_str(&self) -> &str {
+        match self {
+            MemoryEvent::Low => "low",
+            MemoryEvent::High => "high",
+            MemoryEvent::Max => "max",
+            MemoryEvent::Oom => "oom",
+            MemoryEvent::OomKill => "oom_kill",
+            MemoryEvent::OomGroupKill => "oom_group_kill",
+        }
+    }
+}
+
+impl Display for MemoryEvent {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+/// Configuration for a `CgroupWatcher`
+#[derive(Debug, Clone)]
+pub struct Config {
+    // The target difference between the total memory reserved for the cgroup
+    // and the value of the cgroup's memory.high.
+    //
+    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
+    // use (equal to system memory, minus whatever's taken out for the file cache).
+    oom_buffer_bytes: u64,
+
+    // The amount of memory, in bytes, below a proposed new value for
+    // memory.high that the cgroup's memory usage must be for us to downscale
+    //
+    // In other words, we can downscale only when:
+    //
+    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
+    //
+    // TODO: there's some minor issues with this approach -- in particular, that we might have
+    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
+    pub(crate) memory_high_buffer_bytes: u64,
+
+    // The maximum duration that we're allowed to pause
+    // the cgroup for while waiting for the autoscaler-agent to upscale us
+    max_upscale_wait: Duration,
+
+    // The required minimum time that we must wait before re-freezing
+    // the cgroup while waiting for the autoscaler-agent to upscale us.
+    do_not_freeze_more_often_than: Duration,
+
+    // The amount of memory, in bytes, that we should periodically increase memory.high
+    // by while waiting for the autoscaler-agent to upscale us.
+    //
+    // This exists to avoid the excessive throttling that happens when a cgroup is above its
+    // memory.high for too long. See more here:
+    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
+    memory_high_increase_by_bytes: u64,
+
+    // The period at which we should repeatedly increase the value
+    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
+    // is still being hit.
+    //
+    // Technically speaking, this actually serves as a rate limit to moderate responding to
+    // memory.high events, but these are roughly equivalent if the process is still allocating
+    // memory.
+    memory_high_increase_every: Duration,
+}
+
+impl Config {
+    /// Calculate the new value for the cgroups memory.high based on system memory
+    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
+        total_system_mem.saturating_sub(self.oom_buffer_bytes)
+    }
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            oom_buffer_bytes: 100 * MiB,
+            memory_high_buffer_bytes: 100 * MiB,
+            // while waiting for upscale, don't freeze for more than 20ms every 1s
+            max_upscale_wait: Duration::from_millis(20),
+            do_not_freeze_more_often_than: Duration::from_millis(1000),
+            // while waiting for upscale, increase memory.high by 10MiB every 25ms
+            memory_high_increase_by_bytes: 10 * MiB,
+            memory_high_increase_every: Duration::from_millis(25),
+        }
+    }
+}
+
+/// Used to represent data that is associated with a certain point in time, such
+/// as an upscale request or memory.high event.
+///
+/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
+/// a unique sequence number. Sequence numbers are monotonically increasing,
+/// allowing us to answer questions like "did this upscale happen after this
+/// memory.high event?" by comparing the sequence numbers of the two events.
+#[derive(Debug, Clone)]
+pub struct Sequenced<T> {
+    seqnum: u64,
+    data: T,
+}
+
+impl<T> Sequenced<T> {
+    pub fn new(data: T) -> Self {
+        Self {
+            seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
+            data,
+        }
+    }
+}
+
+/// Responds to `MonitorEvents` to manage the cgroup: preventing it from being
+/// OOM killed or throttled.
+///
+/// The `CgroupWatcher` primarily achieves this by reading from a stream of
+/// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the
+/// cgroup happy.
+#[derive(Debug)]
+pub struct CgroupWatcher {
+    pub config: Config,
+
+    /// The sequence number of the last upscale.
+    ///
+    /// If we receive a memory.high event that has a _lower_ sequence number than
+    /// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
+    /// can safely ignore it.
+    ///
+    /// Note: Like the `events` field, this doesn't _need_ interior mutability but we
+    /// use it anyways so that methods take `&self`, not `&mut self`.
+    last_upscale_seqnum: AtomicU64,
+
+    /// A channel on which we send messages to request upscale from the dispatcher.
+    upscale_requester: mpsc::Sender<()>,
+
+    /// The actual cgroup we are watching and managing.
+    cgroup: cgroups_rs::Cgroup,
+}
+
+/// Read memory.events for the desired event type.
+///
+/// `path` specifies the path to the desired `memory.events` file.
+/// For more info, see the `memory.events` section of the [kernel docs]
+/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
+fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
+    let contents = fs::read_to_string(path)
+        .with_context(|| format!("failed to read memory.events from {path}"))?;
+
+    // The contents of the file look like:
+    // low 42
+    // high 101
+    // ...
+    contents
+        .lines()
+        .filter_map(|s| s.split_once(' '))
+        .find(|(e, _)| *e == event.as_str())
+        .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
+        .and_then(|(_, count)| {
+            count
+                .parse::<u64>()
+                .with_context(|| format!("failed to parse memory.{event} as u64"))
+        })
+}
+
+/// Create an event stream that produces events whenever the file at the provided
+/// path is modified.
+fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
+    info!("creating file watcher for {path}");
+    let inotify = Inotify::init().context("failed to initialize file watcher")?;
+    inotify
+        .watches()
+        .add(path, WatchMask::MODIFY)
+        .with_context(|| format!("failed to start watching {path}"))?;
+    inotify
+        // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
+        // to store one event at a time - if the event gets written over, that's
+        // ok. We still see that there is an event. For more information, see:
+        // https://man7.org/linux/man-pages/man7/inotify.7.html
+        .into_event_stream([0u8; 1024])
+        .context("failed to start inotify event stream")
+}
+
+impl CgroupWatcher {
+    /// Create a new `CgroupWatcher`.
+    #[tracing::instrument(skip_all, fields(%name))]
+    pub fn new(
+        name: String,
+        // A channel on which to send upscale requests
+        upscale_requester: mpsc::Sender<()>,
+    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
+        // TODO: clarify exactly why we need v2
+        // Make sure cgroups v2 (aka unified) are supported
+        if !is_cgroup2_unified_mode() {
+            anyhow::bail!("cgroups v2 not supported");
+        }
+        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
+
+        // Start monitoring the cgroup for memory events. In general, for
+        // cgroups v2 (aka unified), metrics are reported in files like
+        // > `/sys/fs/cgroup/{name}/{metric}`
+        // We are looking for `memory.high` events, which are stored in the
+        // file `memory.events`. For more info, see the `memory.events` section
+        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
+        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
+        let memory_events = create_file_watcher(&path)
+            .with_context(|| format!("failed to create event watcher for {path}"))?
+            // This would be nice with .inspect_err followed by .ok
+            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
+                Ok(high) => Some(high),
+                Err(error) => {
+                    // TODO: Might want to just panic here
+                    warn!(?error, "failed to read high events count from {}", &path);
+                    None
+                }
+            })
+            // Only report the event if the memory.high count increased
+            .filter_map(|high| {
+                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
+                    Some(high)
+                } else {
+                    None
+                }
+            })
+            .map(Sequenced::new);
+
+        let initial_count = get_event_count(
+            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
+            MemoryEvent::High,
+        )?;
+
+        info!(initial_count, "initial memory.high event count");
+
+        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
+        // running in the cgroup before that caused it to be non-zero.
+        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
+
+        Ok((
+            Self {
+                cgroup,
+                upscale_requester,
+                last_upscale_seqnum: AtomicU64::new(0),
+                config: Default::default(),
+            },
+            memory_events,
+        ))
+    }
+
+    /// The entrypoint for the `CgroupWatcher`.
+    #[tracing::instrument(skip_all)]
+    pub async fn watch<E>(
+        &self,
+        // These are ~dependency injected~ (fancy, I know) because this function
+        // should never return.
+        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
+        // -> therefore: if we want to stick it in an Arc so many threads can access
+        //    it, methods can never take mutable access.
+        //     - note: we use the Arc strategy so that a) we can call this function
+        //             right here and b) the runner can call the set/get_memory methods
+        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
+        //    we just pass them in here instead of holding them in fields, as that
+        //    would require this method to take &mut self.
+        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
+        events: E,
+    ) -> anyhow::Result<()>
+    where
+        E: Stream<Item = Sequenced<u64>>,
+    {
+        // There are several actions we might take when receiving a `memory.high` event,
+        // such as freezing the cgroup, or increasing its `memory.high`. We don't
+        // want to do these things too often (because postgres needs to run, and
+        // we only have so much memory). These timers serve as rate limits for this.
+        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut events = pin!(events);
+
+        // Are we waiting to be upscaled? Could be true if we request upscale due
+        // to a memory.high event and it does not arrive in time.
+        let mut waiting_on_upscale = false;
+
+        loop {
+            tokio::select! {
+                upscale = upscales.recv() => {
+                    let Sequenced { seqnum, data } = upscale
+                        .context("failed to listen on upscale notification channel")?;
+                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                }
+                event = events.next() => {
+                    let Some(Sequenced { seqnum, .. }) = event else {
+                        bail!("failed to listen for memory.high events")
+                    };
+                    // The memory.high came before our last upscale, so we consider
+                    // it resolved
+                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
+                        info!(
+                            "received memory.high event, but it came before our last upscale -> ignoring it"
+                        );
+                        continue;
+                    }
+
+                    // The memory.high came after our latest upscale. We don't
+                    // want to do anything yet, so peek the next event in hopes
+                    // that it's an upscale.
+                    if let Some(upscale_num) = self
+                        .upscaled(&mut upscales)
+                        .context("failed to check if we were upscaled")?
+                    {
+                        if upscale_num > seqnum {
+                            info!(
+                                "received memory.high event, but it came before our last upscale -> ignoring it"
+                            );
+                            continue;
+                        }
+                    }
+
+                    // If it's been long enough since we last froze, freeze the
+                    // cgroup and request upscale
+                    if wait_to_freeze.is_elapsed() {
+                        info!("received memory.high event -> requesting upscale");
+                        waiting_on_upscale = self
+                            .handle_memory_high_event(&mut upscales)
+                            .await
+                            .context("failed to handle upscale")?;
+                        wait_to_freeze
+                            .as_mut()
+                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
+                        continue;
+                    }
+
+                    // Ok, we can't freeze, just request upscale
+                    if !waiting_on_upscale {
+                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");
+
+                        // Check that we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to request upscaling because we got upscaled");
+                            continue;
+                        }
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+                        continue;
+                    }
+
+                    // Shoot, we can't freeze and we're still waiting on upscale,
+                    // increase memory.high to reduce throttling
+                    if wait_to_increase_memory_high.is_elapsed() {
+                        info!(
+                            "received memory.high event, \
+                            but too soon to refreeze and already requested upscale \
+                            -> increasing memory.high"
+                        );
+
+                        // Check that we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to increase memory.high because got upscaled");
+                            continue;
+                        }
+
+                        // Request upscale anyways (the agent will handle deduplicating
+                        // requests)
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+
+                        let memory_high =
+                            self.get_high_bytes().context("failed to get memory.high")?;
+                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
+                        info!(
+                            current_high_bytes = memory_high,
+                            new_high_bytes = new_high,
+                            "updating memory.high"
+                        );
+                        self.set_high_bytes(new_high)
+                            .context("failed to set memory.high")?;
+                        wait_to_increase_memory_high
+                            .as_mut()
+                            .reset(Instant::now() + self.config.memory_high_increase_every)
+                    }
+
+                    // we can't do anything
+                }
+            };
+        }
+    }
+
+    /// Handle a `memory.high` event, returning whether we are still waiting on
+    /// upscale by the time the function returns.
+    ///
+    /// The general plan for handling a `memory.high` event is as follows:
+    /// 1. Freeze the cgroup
+    /// 2. Start a timer for `self.config.max_upscale_wait`
+    /// 3. Request upscale
+    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
+    /// 5. Return whether or not we are still waiting for upscale. If we are,
+    ///    we'll increase the cgroups memory.high to avoid getting oom killed
+    ///
+    /// Once the cgroup has been frozen, every exit path (including the error
+    /// paths) attempts to thaw it before returning.
+    #[tracing::instrument(skip_all)]
+    async fn handle_memory_high_event(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<bool> {
+        // Immediately freeze the cgroup before doing anything else.
+        info!("received memory.high event -> freezing cgroup");
+        self.freeze().context("failed to freeze cgroup")?;
+
+        // We'll use this for logging durations
+        let start_time = Instant::now();
+
+        // Await the upscale until we have to unfreeze
+        let timed =
+            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
+
+        // Request the upscale
+        info!(
+            wait = ?self.config.max_upscale_wait,
+            "sending request for immediate upscaling",
+        );
+        // **important**: unfreeze the cgroup before reporting the error, otherwise
+        // a failed request would leave the cgroup frozen
+        if let Err(e) = self.upscale_requester.send(()).await {
+            info!("error requesting upscale -> thawing cgroup");
+            self.thaw()
+                .context("failed to thaw cgroup after failing to request upscale")?;
+            return Err(e).context("failed to request upscale");
+        }
+
+        let waiting_on_upscale = match timed.await {
+            Ok(Ok(())) => {
+                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
+                false
+            }
+            // **important**: unfreeze the cgroup before ?-reporting the error
+            Ok(Err(e)) => {
+                info!("error waiting for upscale -> thawing cgroup");
+                self.thaw()
+                    .context("failed to thaw cgroup after errored waiting for upscale")?;
+                Err(e.context("failed to await upscale"))?
+            }
+            Err(_) => {
+                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
+                true
+            }
+        };
+
+        info!("thawing cgroup");
+        self.thaw().context("failed to thaw cgroup")?;
+
+        Ok(waiting_on_upscale)
+    }
+
+    /// Non-blocking poll of the upscale channel: yields the sequence number of
+    /// a pending upscale notification, or `None` when the channel is empty.
+    #[tracing::instrument(skip_all)]
+    fn upscaled(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<Option<u64>> {
+        match upscales.try_recv() {
+            Ok(Sequenced { seqnum, data }) => {
+                // Record the sequence number of the upscale we just consumed
+                self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                Ok(Some(seqnum))
+            }
+            Err(TryRecvError::Empty) => Ok(None),
+            Err(TryRecvError::Disconnected) => {
+                bail!("upscale notification channel was disconnected")
+            }
+        }
+    }
+
+    /// Block until the next upscale notification arrives on `upscales` and
+    /// record its sequence number in `last_upscale_seqnum`.
+    ///
+    /// This is used in `handle_memory_high_event`, which wraps it in a timeout
+    /// so the cgroup can be thawed early. A closed channel is reported as an error.
+    #[tracing::instrument(skip_all)]
+    async fn await_upscale(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<()> {
+        let received = upscales.recv().await;
+        let Sequenced { seqnum: latest, .. } =
+            received.context("error listening for upscales")?;
+
+        // Remember which upscale we have most recently observed
+        self.last_upscale_seqnum.store(latest, Ordering::Release);
+        Ok(())
+    }
+
+    /// Get the path of the cgroup being watched, as reported by the
+    /// underlying cgroup handle.
+    pub fn path(&self) -> &str {
+        self.cgroup.path()
+    }
+}
+
+/// Represents a set of limits we apply to a cgroup to control memory usage.
+///
+/// Setting these values also affects the thresholds for receiving usage alerts.
+#[derive(Debug)]
+pub struct MemoryLimits {
+    /// Value written to the cgroup's memory.high by `CgroupWatcher::set_limits`.
+    high: u64,
+    /// Value written to the cgroup's memory.max by `CgroupWatcher::set_limits`.
+    max: u64,
+}
+
+impl MemoryLimits {
+    /// Bundle a `high` and a `max` limit. Note the argument order: `high`
+    /// first, then `max`.
+    pub fn new(high: u64, max: u64) -> Self {
+        Self { max, high }
+    }
+}
+
+// Methods for manipulating the actual cgroup
+impl CgroupWatcher {
+    /// Look up the freezer controller among the cgroup's subsystems.
+    ///
+    /// Fails if none of the subsystems is a freezer.
+    fn freezer(&self) -> anyhow::Result<&FreezerController> {
+        let found = self.cgroup.subsystems().iter().find_map(|sub| match sub {
+            Freezer(controller) => Some(controller),
+            _ => None,
+        });
+        match found {
+            Some(controller) => Ok(controller),
+            None => anyhow::bail!("could not find freezer subsystem"),
+        }
+    }
+
+    /// Ask the freezer controller to freeze the cgroup.
+    pub fn freeze(&self) -> anyhow::Result<()> {
+        let lookup = self.freezer();
+        let controller = lookup.context("failed to get freezer subsystem")?;
+        let frozen = controller.freeze();
+        frozen.context("failed to freeze")
+    }
+
+    /// Ask the freezer controller to thaw the cgroup.
+    pub fn thaw(&self) -> anyhow::Result<()> {
+        let lookup = self.freezer();
+        let controller = lookup.context("failed to get freezer subsystem")?;
+        let thawed = controller.thaw();
+        thawed.context("failed to thaw")
+    }
+
+    /// Look up the memory controller among the cgroup's subsystems.
+    ///
+    /// Note: this method does not require `self.memory_update_lock` because
+    /// getting a handle to the subsystem does not access any of the files we
+    /// care about, such as memory.high and memory.events
+    fn memory(&self) -> anyhow::Result<&MemController> {
+        let found = self.cgroup.subsystems().iter().find_map(|sub| match sub {
+            Mem(controller) => Some(controller),
+            _ => None,
+        });
+        match found {
+            Some(controller) => Ok(controller),
+            None => anyhow::bail!("could not find memory subsystem"),
+        }
+    }
+
+    /// Report the cgroup's current memory usage, taken from the
+    /// `usage_in_bytes` field of the memory controller's statistics.
+    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
+        let controller = self
+            .memory()
+            .context("failed to get memory subsystem")?;
+        Ok(controller.memory_stat().usage_in_bytes)
+    }
+
+    /// Write `bytes` as the cgroup's memory.high threshold, passing `None`
+    /// for min, low and max.
+    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        let update = cgroups_rs::memory::SetMemory {
+            min: None,
+            low: None,
+            // The controller takes a signed value, so clamp to i64::MAX before casting
+            high: Some(MaxValue::Value(bytes.min(i64::MAX as u64) as i64)),
+            max: None,
+        };
+        let controller = self.memory().context("failed to get memory subsystem")?;
+        controller
+            .set_mem(update)
+            .context("failed to set memory.high")
+    }
+
+    /// Write both memory.high and memory.max for the cgroup, taking the values
+    /// from `limits`. The new values are logged before they are written.
+    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
+        info!(
+            limits.high,
+            limits.max,
+            path = self.path(),
+            "writing new memory limits",
+        );
+
+        // The controller takes signed values, so clamp to i64::MAX before casting
+        let clamp = |bytes: u64| MaxValue::Value(bytes.min(i64::MAX as u64) as i64);
+        let update = cgroups_rs::memory::SetMemory {
+            min: None,
+            low: None,
+            high: Some(clamp(limits.high)),
+            max: Some(clamp(limits.max)),
+        };
+
+        let controller = self
+            .memory()
+            .context("failed to get memory subsystem while setting memory limits")?;
+        controller
+            .set_mem(update)
+            .context("failed to set memory limits")
+    }
+
+    /// Derive memory.high from `available_memory` using the watcher's config,
+    /// use `available_memory` itself as memory.max, and apply both limits.
+    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
+        let limits = MemoryLimits::new(
+            self.config.calculate_memory_high_value(available_memory),
+            available_memory,
+        );
+        info!(
+            path = self.path(),
+            memory = ?limits,
+            "setting cgroup memory",
+        );
+
+        self.set_limits(&limits)
+            .context("failed to set cgroup memory limits")
+    }
+
+    /// Read back the cgroup's memory.high threshold. An unlimited (`max`)
+    /// setting is reported as `i64::MAX`.
+    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
+        let controller = self
+            .memory()
+            .context("failed to get memory subsystem while getting memory statistics")?;
+        let settings = controller
+            .get_mem()
+            .context("failed to get memory statistics from subsystem")?;
+        match settings.high {
+            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
+            Some(MaxValue::Max) => Ok(i64::MAX as u64),
+            Some(MaxValue::Value(bytes)) => Ok(bytes as u64),
+        }
+    }
+}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -0,0 +1,155 @@
+//! Managing the websocket connection and other signals in the monitor.
+//!
+//! Contains types that manage the interaction (not data interchange, see `protocol`)
+//! between informant and monitor, allowing us to process and send messages in a
+//! straightforward way. The dispatcher also manages the signals that come from
+//! the cgroup (requesting upscale), and the signals that go to the cgroup
+//! (notifying it of upscale).
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::{
+    stream::{SplitSink, SplitStream},
+    SinkExt, StreamExt,
+};
+use tokio::sync::mpsc;
+use tracing::info;
+
+use crate::cgroup::Sequenced;
+use crate::protocol::{
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
+    PROTOCOL_MIN_VERSION,
+};
+
+/// The central handler for all communications in the monitor.
+///
+/// The dispatcher has two purposes:
+/// 1. Manage the connection to the informant, sending and receiving messages.
+/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
+///    and sending a message to the informant when the cgroup manager requests
+///    upscale.
+#[derive(Debug)]
+pub struct Dispatcher {
+    /// We read informant messages off of `source`
+    pub(crate) source: SplitStream<WebSocket>,
+
+    /// We send messages to the informant through `sink`
+    sink: SplitSink<WebSocket, Message>,
+
+    /// Used to notify the cgroup when we are upscaled.
+    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+
+    /// When the cgroup requests upscale it will send on this channel. In response
+    /// we send an `UpscaleRequest` to the agent.
+    pub(crate) request_upscale_events: mpsc::Receiver<()>,
+
+    /// The protocol version we have agreed to use with the informant. This is negotiated
+    /// during the creation of the dispatcher, and should be the highest shared protocol
+    /// version.
+    ///
+    // NOTE: currently unused, but will almost certainly be used in the future
+    // as the protocol changes
+    #[allow(unused)]
+    pub(crate) proto_version: ProtocolVersion,
+}
+
+impl Dispatcher {
+    /// Creates a new dispatcher using the passed-in connection.
+    ///
+    /// Performs a negotiation with the informant to determine the highest protocol
+    /// version that both support. This consists of two steps:
+    /// 1. Wait for the informant to send the range of protocols it supports.
+    /// 2. Send a protocol version that works for us as well, or an error if there
+    ///    is no compatible version.
+    ///
+    /// Returns an error if the connection closes or yields a non-text message
+    /// during the handshake, if the range cannot be deserialized, if replying
+    /// fails, or if the two version ranges do not overlap.
+    pub async fn new(
+        stream: WebSocket,
+        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+        request_upscale_events: mpsc::Receiver<()>,
+    ) -> anyhow::Result<Self> {
+        let (mut sink, mut source) = stream.split();
+
+        // Figure out the highest protocol version we both support
+        info!("waiting for informant to send protocol version range");
+        let Some(message) = source.next().await else {
+            bail!("websocket connection closed while performing protocol handshake")
+        };
+
+        let message = message.context("failed to read protocol version range off connection")?;
+
+        let Message::Text(message_text) = message else {
+            // All messages should be in text form, since we don't do any
+            // pinging/ponging. See nhooyr/websocket's implementation and the
+            // informant/agent for more info
+            bail!("received non-text message during protocol handshake: {message:?}")
+        };
+
+        let monitor_range = ProtocolRange {
+            min: PROTOCOL_MIN_VERSION,
+            max: PROTOCOL_MAX_VERSION,
+        };
+
+        let informant_range: ProtocolRange = serde_json::from_str(&message_text)
+            .context("failed to deserialize protocol version range")?;
+
+        info!(range = ?informant_range, "received protocol version range");
+
+        let highest_shared_version = match monitor_range.highest_shared_version(&informant_range) {
+            Ok(version) => {
+                sink.send(Message::Text(
+                    serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
+                ))
+                .await
+                .context("failed to notify informant of negotiated protocol version")?;
+                version
+            }
+            Err(e) => {
+                // Tell the informant why we are giving up before reporting the
+                // error to our own caller
+                sink.send(Message::Text(
+                    serde_json::to_string(&ProtocolResponse::Error(format!(
+                        "Received protocol version range {} which does not overlap with {}",
+                        informant_range, monitor_range
+                    )))
+                    .unwrap(),
+                ))
+                .await
+                .context(
+                    "failed to notify informant of no overlap between protocol version ranges",
+                )?;
+                Err(e).context("error determining suitable protocol version range")?
+            }
+        };
+
+        Ok(Self {
+            sink,
+            source,
+            notify_upscale_events,
+            request_upscale_events,
+            proto_version: highest_shared_version,
+        })
+    }
+
+    /// Forward an upscale notification to the cgroup manager over
+    /// `notify_upscale_events`, returning an error if the send fails.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
+        // Only the resources are sent here; no acknowledgement is awaited
+        let delivery = self.notify_upscale_events.send(resources).await;
+        delivery.context("failed to send resources and oneshot sender across channel")
+    }
+
+    /// Serialize `message` to JSON and send it to the informant as a text frame.
+    ///
+    /// Although this function is small, it has one major benefit: it is the only
+    /// way to send data across the connection, and you can only pass in a proper
+    /// `OutboundMsg`. Without safeguards like this, it's easy to accidentally
+    /// serialize the wrong thing and send it, since `self.sink.send` will take
+    /// any string.
+    pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
+        info!(?message, "sending message");
+        let frame = serde_json::to_string(&message)
+            .map(Message::Text)
+            .context("failed to serialize message")?;
+        let outcome = self.sink.send(frame).await;
+        outcome.context("stream error sending message")
+    }
+}
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -0,0 +1,306 @@
+//! Logic for configuring and scaling the Postgres file cache.
+
+use std::num::NonZeroU64;
+
+use crate::MiB;
+use anyhow::{anyhow, Context};
+use tokio_postgres::{types::ToSql, Client, NoTls, Row};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+/// Manages Postgres' file cache by keeping a connection open.
+#[derive(Debug)]
+pub struct FileCacheState {
+    /// Client for the currently open Postgres connection; replaced when
+    /// `query_with_retry` has to reconnect.
+    client: Client,
+    /// Connection string used for the initial connection and for reconnecting.
+    conn_str: String,
+    /// Sizing parameters for the file cache.
+    pub(crate) config: FileCacheConfig,
+
+    /// A token for cancelling spawned threads during shutdown.
+    token: CancellationToken,
+}
+
+#[derive(Debug)]
+pub struct FileCacheConfig {
+    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
+    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
+    /// memory available for the cgroup.
+    pub(crate) in_memory: bool,
+
+    /// The size of the file cache, in terms of the size of the resource it consumes
+    /// (currently: only memory)
+    ///
+    /// For example, setting `resource_multiplier = 0.75` gives the cache a target size of 75% of total
+    /// resources.
+    ///
+    /// This value must be strictly between 0 and 1.
+    resource_multiplier: f64,
+
+    /// The required minimum amount of memory, in bytes, that must remain available
+    /// after subtracting the file cache.
+    ///
+    /// This value must be non-zero.
+    min_remaining_after_cache: NonZeroU64,
+
+    /// Controls the rate of increase in the file cache's size as it grows from zero
+    /// (when total resources equals min_remaining_after_cache) to the desired size based on
+    /// `resource_multiplier`.
+    ///
+    /// A `spread_factor` of zero means that all additional resources will go to the cache until it
+    /// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
+    /// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
+    /// its desired size".
+    ///
+    /// This value must be >= 0, and must retain an increase that is more than what would be given by
+    /// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
+    /// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
+    /// as desired by `resource_multiplier`.
+    ///
+    /// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
+    /// This is the condition rejected by `FileCacheConfig::validate`.
+    spread_factor: f64,
+}
+
+impl Default for FileCacheConfig {
+    /// Defaults: in-memory cache, 75% resource multiplier, 640 MiB kept free
+    /// for the rest of the system, and a spread factor of 0.1.
+    fn default() -> Self {
+        // 640 MiB; (512 + 128)
+        let min_remaining_after_cache = NonZeroU64::new(640 * MiB).unwrap();
+        Self {
+            in_memory: true,
+            resource_multiplier: 0.75, // 75 %
+            min_remaining_after_cache,
+            // ensure any increase in file cache size is split 90-10 with 10% to other memory
+            spread_factor: 0.1,
+        }
+    }
+}
+
+impl FileCacheConfig {
+    /// Make sure fields of the config are consistent.
+    ///
+    /// Returns an error if `resource_multiplier` is not strictly between 0 and 1,
+    /// if `spread_factor` is negative, or if the two are incompatible, i.e.
+    /// `resource_multiplier * (spread_factor + 1) >= 1`.
+    pub fn validate(&self) -> anyhow::Result<()> {
+        // Single field validity
+        anyhow::ensure!(
+            0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
+            "resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
+            self.resource_multiplier
+        );
+        anyhow::ensure!(
+            self.spread_factor >= 0.0,
+            "spread_factor must be >= 0, got {}",
+            self.spread_factor
+        );
+
+        // Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
+        //
+        // As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
+        // `spread_factor`, respectively. They are:
+        //
+        //                 `total`           `min_remaining_after_cache`
+        //   size = ————————————————————— - —————————————————————————————
+        //           `spread_factor` + 1         `spread_factor` + 1
+        //
+        // and
+        //
+        //   size = `resource_multiplier` × total
+        //
+        // .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
+        // form, with y = "size" and x = "total".
+        //
+        // These lines intersect at:
+        //
+        //               `min_remaining_after_cache`
+        //   ———————————————————————————————————————————————————
+        //    1 - `resource_multiplier` × (`spread_factor` + 1)
+        //
+        // We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
+        // guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
+        // (We also need it to be >= 0, but that's already guaranteed.)
+
+        let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
+        anyhow::ensure!(
+            intersect_factor < 1.0,
+            "incompatible resource_multiplier and spread_factor (intersect factor {intersect_factor} >= 1)"
+        );
+        Ok(())
+    }
+
+    /// Compute the target file cache size in bytes (a whole number of MiB) for
+    /// `total` bytes of memory.
+    pub fn calculate_cache_size(&self, total: u64) -> u64 {
+        // Nothing is left for the cache until `min_remaining_after_cache` is covered.
+        let reserve = self.min_remaining_after_cache.get();
+        if total <= reserve {
+            return 0;
+        }
+        let available = total - reserve;
+
+        // Upper bound imposed by `spread_factor`; going through i64 and clamping
+        // at zero guards the float -> integer conversion.
+        let spread_cap = (available as f64 / (1.0 + self.spread_factor)) as i64;
+        let spread_cap = spread_cap.max(0) as u64;
+        // Upper bound imposed by `resource_multiplier`.
+        let multiplier_cap = (total as f64 * self.resource_multiplier) as u64;
+
+        // The cache is sized in whole mebibytes; round down to be conservative.
+        spread_cap.min(multiplier_cap) / MiB * MiB
+    }
+}
+
+impl FileCacheState {
+    /// Validate `config`, open a connection to Postgres using `conn_str`, and
+    /// bundle everything needed to manage (and reconnect to) the file cache.
+    #[tracing::instrument(skip_all, fields(%conn_str, ?config))]
+    pub async fn new(
+        conn_str: &str,
+        config: FileCacheConfig,
+        token: CancellationToken,
+    ) -> anyhow::Result<Self> {
+        config.validate().context("file cache config is invalid")?;
+
+        info!(conn_str, "connecting to Postgres file cache");
+        let connecting = Self::connect(conn_str, token.clone());
+        let client = connecting
+            .await
+            .context("failed to connect to postgres file cache")?;
+
+        Ok(Self {
+            client,
+            conn_str: conn_str.to_string(),
+            config,
+            token,
+        })
+    }
+
+    /// Connect to Postgres.
+    ///
+    /// The connection half returned by `tokio_postgres::connect` is handed to
+    /// `crate::spawn_with_cancel` together with `token`. This is not a method
+    /// as it is called in [`FileCacheState::new`].
+    #[tracing::instrument(skip_all, fields(%conn_str))]
+    async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
+        let connected = tokio_postgres::connect(conn_str, NoTls).await;
+        let (client, conn) = connected.context("failed to connect to pg client")?;
+
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own. See tokio-postgres docs.
+        crate::spawn_with_cancel(
+            token,
+            |res| match res {
+                Err(error) => error!(%error, "postgres error"),
+                Ok(_) => {}
+            },
+            conn,
+        );
+
+        Ok(client)
+    }
+
+    /// Execute a query with a retry if necessary.
+    ///
+    /// If the initial query fails, we open a fresh database connection and
+    /// attempt it again exactly once; a second failure is returned to the caller.
+    #[tracing::instrument(skip_all, fields(%statement))]
+    pub async fn query_with_retry(
+        &mut self,
+        statement: &str,
+        params: &[&(dyn ToSql + Sync)],
+    ) -> anyhow::Result<Vec<Row>> {
+        let first_attempt = self.client.query(statement, params).await;
+        let e = match first_attempt.context("failed to execute query") {
+            Ok(rows) => return Ok(rows),
+            Err(e) => e,
+        };
+        error!(error = ?e, "postgres error: {e} -> retrying");
+
+        let replacement = FileCacheState::connect(&self.conn_str, self.token.clone())
+            .await
+            .context("failed to connect to postgres file cache")?;
+        info!("successfully reconnected to postgres client");
+
+        // Replace the old client and attempt the query with the new one
+        self.client = replacement;
+        let second_attempt = self.client.query(statement, params).await;
+        second_attempt.context("failed to execute query a second time")
+    }
+
+    /// Ask Postgres for the current `neon.file_cache_size_limit`, in bytes.
+    #[tracing::instrument(skip_all)]
+    pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
+        // The file cache GUC variable is in MiB, but the conversion with
+        // pg_size_bytes means that the end result we get is in bytes.
+        let query = "SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));";
+        let rows = self
+            .query_with_retry(query, &[])
+            .await
+            .context("failed to query pg for file cache size")?;
+        let row = rows
+            .first()
+            .ok_or_else(|| anyhow!("file cache size query returned no rows"))?;
+
+        // pg_size_bytes returns a bigint which is the same as an i64.
+        let bytes: i64 = row
+            .try_get(0)
+            .context("failed to extract file cache size from query result")?;
+        // NOTE(review): the cast assumes the reported size is never negative
+        Ok(bytes as u64)
+    }
+
+    /// Attempt to set the file cache size, returning the size it was actually
+    /// set to.
+    ///
+    /// The requested `num_bytes` is capped at `neon.max_file_cache_size` and
+    /// rounded down to a whole number of MiB before being written with
+    /// `ALTER SYSTEM`; the returned value is that final size, in bytes.
+    #[tracing::instrument(skip_all, fields(%num_bytes))]
+    pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
+        // The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
+        // means that the end result we get is in bytes.
+        let max_bytes = self
+            .query_with_retry(
+                "SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
+                &[],
+            )
+            .await
+            .context("failed to query pg for max file cache size")?
+            .first()
+            .ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
+            .try_get::<_, i64>(0)
+            .map(|bytes| bytes as u64)
+            .context("failed to extract max file cache size from query result")?;
+
+        let max_mb = max_bytes / MiB;
+        let num_mb = u64::min(num_bytes, max_bytes) / MiB;
+
+        // `capped` carries its own leading space so the log message has no
+        // trailing or doubled whitespace in either case
+        let capped = if num_bytes > max_bytes {
+            " (capped by maximum size)"
+        } else {
+            ""
+        };
+
+        info!(
+            size = num_mb,
+            max = max_mb,
+            "updating file cache size{capped}",
+        );
+
+        // note: even though the normal ways to get the cache size produce values with trailing "MB"
+        // (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
+        // it expects to set the value is "integer number of MB" without trailing units.
+        // For some reason, this *really* wasn't working with normal arguments, so that's
+        // why we're constructing the query here.
+        self.client
+            .query(
+                &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
+                &[],
+            )
+            .await
+            .context("failed to change file cache size limit")?;
+
+        // must use pg_reload_conf to have the settings change take effect
+        self.client
+            .execute("SELECT pg_reload_conf();", &[])
+            .await
+            .context("failed to reload config")?;
+
+        Ok(num_mb * MiB)
+    }
+}
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -0,0 +1,205 @@
+#![cfg(target_os = "linux")]
+
+use anyhow::Context;
+use axum::{
+    extract::{ws::WebSocket, State, WebSocketUpgrade},
+    response::Response,
+};
+use axum::{routing::get, Router, Server};
+use clap::Parser;
+use futures::Future;
+use std::{fmt::Debug, time::Duration};
+use sysinfo::{RefreshKind, System, SystemExt};
+use tokio::{sync::broadcast, task::JoinHandle};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+use runner::Runner;
+
+// Code that interfaces with agent
+pub mod dispatcher;
+pub mod protocol;
+
+pub mod cgroup;
+pub mod filecache;
+pub mod runner;
+
+// NOTE: `Parser` is derived below, and clap's derive turns the `///` comments
+// on this struct and its fields into the generated `--help` text, so they are
+// user-facing output rather than purely internal documentation.
+/// The vm-monitor is an autoscaling component started by compute_ctl.
+///
+/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
+/// memory pressure by making requests to the autoscaler-agent.
+#[derive(Debug, Parser)]
+pub struct Args {
+    /// The name of the cgroup we should monitor for memory.high events. This
+    /// is the cgroup that postgres should be running in.
+    // When this is `None`, `Runner::new` creates no cgroup watcher.
+    #[arg(short, long)]
+    pub cgroup: Option<String>,
+
+    /// The connection string for the Postgres file cache we should manage.
+    // When this is `None`, `Runner::new` skips file cache initialization.
+    #[arg(short, long)]
+    pub pgconnstr: Option<String>,
+
+    /// The address we should listen on for connection requests. For the
+    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
+    // Parsed as a socket address in `start` before the server is bound.
+    #[arg(short, long)]
+    pub addr: String,
+}
+
+impl Args {
+    /// Borrows the listen address string stored in `addr`.
+    pub fn addr(&self) -> &str {
+        self.addr.as_str()
+    }
+}
+
+/// One mebibyte expressed in bytes (1024 * 1024 = 1_048_576).
+#[allow(non_upper_case_globals)]
+const MiB: u64 = 1024 * 1024;
+
+/// Express a byte count as a (possibly fractional) number of mebibytes.
+///
+/// Intended for display and logging; the result is an `f32`, so it is only an
+/// approximation for large inputs. Calculations elsewhere stay in bytes.
+pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
+    let bytes_per_mib = MiB as f32;
+    bytes as f32 / bytes_per_mib
+}
+
+/// Returns the total system memory as reported by `sysinfo`, refreshing only
+/// the memory statistics.
+///
+/// NOTE(review): callers treat the result as bytes; confirm this matches the
+/// unit used by the pinned `sysinfo` version.
+pub fn get_total_system_memory() -> u64 {
+    let memory_only = RefreshKind::new().with_memory();
+    let system = System::new_with_specifics(memory_only);
+    system.total_memory()
+}
+
+/// Global app state for the Axum server.
+///
+/// It is handed to the router via `with_state` in `start` and extracted again
+/// in `ws_handler`; it derives `Clone` so the extractor can take it by value.
+#[derive(Debug, Clone)]
+pub struct ServerState {
+    /// Used to close old connections.
+    ///
+    /// When a new connection is made, we send a message signalling to the old
+    /// connection to close.
+    pub sender: broadcast::Sender<()>,
+
+    /// Used to cancel all spawned threads in the monitor.
+    pub token: CancellationToken,
+
+    /// The CLI args. Held as a `'static` reference, so this struct never owns
+    /// or frees them.
+    pub args: &'static Args,
+}
+
+/// Spawn a tokio task that races `future` against cancellation of `token`.
+///
+/// The returned handle resolves to:
+/// * `None` if the token is cancelled before `future` completes, or
+/// * `Some(output)` if `future` completes first, in which case `f` is called
+///   with a reference to the output before it is returned.
+///
+/// This is mainly meant for futures that stay pending for a very long time or
+/// are not meant to return at all, such as [`cgroup::CgroupWatcher::watch`];
+/// `f` is the place to log the fact that such a future resolved anyway.
+pub fn spawn_with_cancel<T, F>(
+    token: CancellationToken,
+    f: F,
+    future: T,
+) -> JoinHandle<Option<T::Output>>
+where
+    T: Future + Send + 'static,
+    T::Output: Send + 'static,
+    F: FnOnce(&T::Output) + Send + 'static,
+{
+    let supervised = async move {
+        tokio::select! {
+            _ = token.cancelled() => {
+                info!("received global kill signal");
+                None
+            }
+            output = future => {
+                f(&output);
+                Some(output)
+            }
+        }
+    };
+    tokio::spawn(supervised)
+}
+
+/// The entrypoint to the monitor server.
+///
+/// Builds the router with its shared [`ServerState`], binds to the address in
+/// `args`, and serves until the server exits. Argument parsing and tracing
+/// setup are not done here.
+///
+/// # Errors
+///
+/// Returns an error if the listen address cannot be parsed as a socket
+/// address, if binding to it fails, or if the server exits with an error.
+pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
+    // This channel is used to close old connections. When a new connection is
+    // made, we send a message signalling to the old connection to close.
+    let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
+
+    let app = Router::new()
+        // This route gets upgraded to a websocket connection. We only support
+        // one connection at a time, which we enforce by killing old connections
+        // when we receive a new one.
+        .route("/monitor", get(ws_handler))
+        .with_state(ServerState {
+            sender,
+            token,
+            args,
+        });
+
+    let addr = args.addr();
+    // The address comes straight from the command line, so a malformed value is
+    // reported as an error instead of panicking.
+    let socket_addr: std::net::SocketAddr = addr
+        .parse()
+        .with_context(|| format!("failed to parse listen address {addr:?}"))?;
+    let bound =
+        Server::try_bind(&socket_addr).with_context(|| format!("failed to bind to {addr}"))?;
+
+    info!(addr, "server bound");
+
+    bound
+        .serve(app.into_make_service())
+        .await
+        .context("server exited")?;
+
+    Ok(())
+}
+
+/// Axum handler for the `/monitor` route: upgrades the request to a websocket
+/// and starts a monitor on it.
+///
+/// Only one connection is served at a time. Before the upgrade, a message is
+/// broadcast on `sender` so that a monitor started by an earlier connection
+/// shuts down; the new monitor then subscribes for the next such message.
+#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
+pub async fn ws_handler(
+    ws: WebSocketUpgrade,
+    State(ServerState {
+        sender,
+        token,
+        args,
+    }): State<ServerState>,
+) -> Response {
+    info!("closing old connection if there is one");
+    // The send result is deliberately discarded.
+    let _ = sender.send(());
+
+    // Subscribe only after sending, so the new monitor does not receive the
+    // kill message that was meant for its predecessor.
+    let kill_rx = sender.subscribe();
+    ws.on_upgrade(|socket| start_monitor(socket, args, kill_rx, token))
+}
+
+/// Runs a single monitor session over an upgraded websocket.
+///
+/// Construction of the [`Runner`] is given at most four seconds. If it fails
+/// or times out, the problem is logged and the function returns. Otherwise the
+/// runner is driven until it exits, and the reason for the exit is logged.
+#[tracing::instrument(skip_all, fields(?args))]
+async fn start_monitor(
+    ws: WebSocket,
+    args: &Args,
+    kill: broadcast::Receiver<()>,
+    token: CancellationToken,
+) {
+    info!("accepted new websocket connection -> starting monitor");
+
+    let timeout = Duration::from_secs(4);
+    let startup = Runner::new(Default::default(), args, ws, kill, token);
+    let mut monitor = match tokio::time::timeout(timeout, startup).await {
+        // Outer error: the deadline elapsed before the runner was built.
+        Err(_) => {
+            error!(
+                ?timeout,
+                "creating monitor timed out (probably waiting to receive protocol range)"
+            );
+            return;
+        }
+        // Inner error: construction finished in time but failed.
+        Ok(Err(error)) => {
+            error!(?error, "failed to create monitor");
+            return;
+        }
+        Ok(Ok(monitor)) => monitor,
+    };
+    info!("connected to informant");
+
+    if let Err(e) = monitor.run().await {
+        error!(error = ?e, "monitor terminated unexpectedly");
+    } else {
+        info!("monitor was killed due to new connection");
+    }
+}
--- a/libs/vm_monitor/src/protocol.rs
+++ b/libs/vm_monitor/src/protocol.rs
@@ -0,0 +1,241 @@
+//! Types representing protocols and actual informant-monitor messages.
+//!
+//! The pervasive use of serde modifiers throughout this module is to ease
+//! serialization on the Go side. Because Go does not have enums (which model
+//! messages well), it is harder to model messages there, and we accommodate
+//! that with serde.
+//!
+//! *Note*: the informant sends and receives messages in different ways.
+//!
+//! The informant sends messages serialized in the form shown below. The use
+//! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
+//! to determine how to deserialize `Content`.
+//! ```ignore
+//! struct {
+//!     Content any
+//!     Type    string
+//!     Id      uint64
+//! }
+//! ```
+//! and receives messages in the form:
+//! ```ignore
+//! struct {
+//!     {fields embedded}
+//!     Type string
+//!     Id   uint64
+//! }
+//! ```
+//! After reading the type field, the informant will decode the entire message
+//! again, this time into the correct type using the embedded fields.
+//! Because the informant cannot just extract the json contained in a certain field
+//! (it initially deserializes to `map[string]interface{}`), we keep the fields
+//! at the top level, so the entire piece of json can be deserialized into a struct,
+//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
+
+use core::fmt;
+use std::cmp;
+
+use serde::{de::Error, Deserialize, Serialize};
+
+/// A message we send to the informant.
+///
+/// Because `inner` is flattened, the serialized form carries the variant's
+/// `type` tag and fields at the top level, next to `id`.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct OutboundMsg {
+    /// The payload; its fields are flattened into the enclosing object.
+    #[serde(flatten)]
+    pub(crate) inner: OutboundMsgKind,
+    /// Message id. Replies built in `Runner::process_message` reuse the id of
+    /// the message they answer.
+    pub(crate) id: usize,
+}
+
+impl OutboundMsg {
+    /// Pairs a message payload with the id it should be sent under.
+    pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
+        OutboundMsg { inner, id }
+    }
+}
+
+/// The different underlying message types we can send to the informant.
+///
+/// Serialized with an internal `type` tag, so a variant's fields sit next to
+/// the tag rather than inside a nested object.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type")]
+pub enum OutboundMsgKind {
+    /// Indicates that the informant sent an invalid message, i.e., we couldn't
+    /// properly deserialize it.
+    InvalidMessage { error: String },
+    /// Indicates that we experienced an internal error while processing a message.
+    /// For example, if a cgroup operation fails while trying to handle an upscale,
+    /// we return `InternalError`.
+    InternalError { error: String },
+    /// Returned to the informant once we have finished handling an upscale. If the
+    /// handling was unsuccessful, an `InternalError` will get returned instead.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleConfirmation {},
+    /// Indicates to the informant that we are urgently requesting resources.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleRequest {},
+    /// Returned to the informant once we have finished attempting to downscale. If
+    /// an error occurred trying to do so, an `InternalError` will get returned instead.
+    /// However, if we are simply unsuccessful (for example, due to needing the resources),
+    /// that gets included in the `DownscaleResult`.
+    DownscaleResult {
+        // FIXME for the future (once the informant is deprecated)
+        // As of the time of writing, the informant/agent version of this struct is
+        // called api.DownscaleResult. This struct has uppercase fields which are
+        // serialized as such. Thus, we serialize using uppercase names so we don't
+        // have to make a breaking change to the agent<->informant protocol. Once
+        // the informant has been superseded by the monitor, we can add the correct
+        // struct tags to api.DownscaleResult without causing a breaking change,
+        // since we don't need to support the agent<->informant protocol anymore.
+        #[serde(rename = "Ok")]
+        ok: bool,
+        #[serde(rename = "Status")]
+        status: String,
+    },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// informant.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// A message received from the informant.
+///
+/// `inner` is flattened, so its `type` and `content` keys are read from the
+/// same JSON object that holds `id`.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct InboundMsg {
+    /// The payload, selected by the `type` key of the incoming object.
+    #[serde(flatten)]
+    pub(crate) inner: InboundMsgKind,
+    /// Message id; replies to this message are sent with the same id.
+    pub(crate) id: usize,
+}
+
+/// The different underlying message types we can receive from the informant.
+///
+/// Adjacently tagged: `type` names the variant and `content` holds its fields.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type", content = "content")]
+pub enum InboundMsgKind {
+    /// Indicates that we sent an invalid message, i.e., the informant couldn't
+    /// properly deserialize it.
+    InvalidMessage { error: String },
+    /// Indicates that the informant experienced an internal error while processing
+    /// a message. For example, if it failed to request upscale from the agent, it
+    /// would return an `InternalError`.
+    InternalError { error: String },
+    /// Indicates to us that we have been granted more resources. We should respond
+    /// with an `UpscaleConfirmation` when done handling the resources (increasing
+    /// file cache size, cgroup memory limits).
+    UpscaleNotification { granted: Resources },
+    /// A request to reduce resource usage. We should respond with a `DownscaleResult`
+    /// when done.
+    DownscaleRequest { target: Resources },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// informant.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// Represents the resources granted to a VM.
+///
+/// Carried by `InboundMsgKind::UpscaleNotification` (as `granted`) and
+/// `InboundMsgKind::DownscaleRequest` (as `target`).
+#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
+// Renamed because the agent/informant has multiple resources types:
+// `Resources` (milliCPU/memory slots)
+// `Allocation` (vCPU/bytes) <- what we correspond to
+#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
+pub struct Resources {
+    /// Number of vCPUs
+    pub(crate) cpu: f64,
+    /// Bytes of memory
+    pub(crate) mem: u64,
+}
+
+impl Resources {
+    /// Builds a `Resources` from a vCPU count and a memory size in bytes.
+    pub fn new(cpu: f64, mem: u64) -> Self {
+        Resources { cpu, mem }
+    }
+}
+
+/// Lower bound of the protocol versions named by this crate.
+// NOTE(review): presumably the lower end of the range offered during version
+// negotiation; the use site is not in this file, so confirm.
+pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+/// Upper bound of the protocol versions named by this crate; currently equal
+/// to the lower bound.
+pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+
+/// A protocol version, stored as a single `u8`.
+///
+/// Ordering and equality are derived from the wrapped number, so a larger
+/// number compares as a later version.
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
+pub struct ProtocolVersion(u8);
+
+impl ProtocolVersion {
+    /// Represents v1.0 of the informant<->monitor protocol - the initial version
+    ///
+    /// Currently the latest version. Stored as the number 1.
+    const V1_0: ProtocolVersion = ProtocolVersion(1);
+}
+
+/// Human-readable rendering of a protocol version.
+///
+/// * `0` renders as `<invalid: zero>`.
+/// * v1.0 renders as `v1.0`.
+/// * any other number `n` renders as `<unknown: n>`.
+impl fmt::Display for ProtocolVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            ProtocolVersion(0) => f.write_str("<invalid: zero>"),
+            ProtocolVersion::V1_0 => f.write_str("v1.0"),
+            // Destructure down to the raw number before formatting. Formatting
+            // the `ProtocolVersion` itself would re-enter this impl and land in
+            // this same arm again, recursing without end.
+            ProtocolVersion(other) => write!(f, "<unknown: {other}>"),
+        }
+    }
+}
+
+/// A set of protocol bounds that determines what we are speaking.
+///
+/// These bounds are inclusive. The custom `Deserialize` impl rejects input
+/// where `min > max`; values built directly in Rust are not checked.
+#[derive(Debug)]
+pub struct ProtocolRange {
+    /// Lowest supported version (inclusive).
+    pub min: ProtocolVersion,
+    /// Highest supported version (inclusive).
+    pub max: ProtocolVersion,
+}
+
+// Hand-written `Deserialize` so that a range with `min > max` is rejected at
+// the parsing boundary instead of being accepted as-is.
+impl<'de> Deserialize<'de> for ProtocolRange {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        // Unchecked mirror of `ProtocolRange`; the derived impl does the field
+        // parsing and the bounds check is applied afterwards.
+        #[derive(Deserialize)]
+        struct InnerProtocolRange {
+            min: ProtocolVersion,
+            max: ProtocolVersion,
+        }
+
+        let InnerProtocolRange { min, max } = InnerProtocolRange::deserialize(deserializer)?;
+        if min <= max {
+            return Ok(ProtocolRange { min, max });
+        }
+
+        let reason = format!("min version = {min} is greater than max version = {max}",);
+        Err(D::Error::custom(reason))
+    }
+}
+
+/// Renders a single version when both bounds coincide, otherwise
+/// `<min> to <max>`.
+impl fmt::Display for ProtocolRange {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.min != self.max {
+            return write!(f, "{} to {}", self.min, self.max);
+        }
+        write!(f, "{}", self.max)
+    }
+}
+
+impl ProtocolRange {
+    /// Find the highest version contained in both `self` and `other`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error naming the offending bounds when the two ranges do not
+    /// overlap.
+    pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
+        // We first have to make sure the ranges overlap. Once we know this,
+        // the highest shared version is the smaller of the two maxes: it is
+        // within both upper bounds, and overlap puts it at or above both mins.
+        if self.min > other.max {
+            anyhow::bail!(
+                "Non-overlapping bounds: other.max = {} was less than self.min = {}",
+                other.max,
+                self.min,
+            )
+        } else if self.max < other.min {
+            anyhow::bail!(
+                "Non-overlapping bounds: self.max = {} was less than other.min = {}",
+                self.max,
+                other.min
+            )
+        } else {
+            Ok(cmp::min(self.max, other.max))
+        }
+    }
+}
+
+/// We send this after negotiating which protocol to use.
+///
+/// NOTE(review): the original wording said this is sent "to the monitor", but
+/// this crate is the monitor, so the recipient is presumably the informant —
+/// confirm against the dispatcher.
+///
+/// With `rename_all = "camelCase"` the variants serialize as `error` and
+/// `version`.
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub enum ProtocolResponse {
+    /// Negotiation failed; carries a description of why.
+    Error(String),
+    /// Negotiation succeeded; carries the chosen version.
+    Version(ProtocolVersion),
+}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -0,0 +1,460 @@
+//! Exposes the `Runner`, which handles messages received from informant and
+//! sends upscale requests.
+//!
+//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
+//! all functionality.
+
+use std::sync::Arc;
+use std::{fmt::Debug, mem};
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::StreamExt;
+use tokio::sync::broadcast;
+use tokio::sync::mpsc;
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+
+use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
+use crate::dispatcher::Dispatcher;
+use crate::filecache::{FileCacheConfig, FileCacheState};
+use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
+use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
+
+/// Central struct that interacts with informant, dispatcher, and cgroup to handle
+/// signals from the informant.
+#[derive(Debug)]
+pub struct Runner {
+    /// Tunables for memory calculations; see [`Config`].
+    config: Config,
+    /// File cache handle; `None` when no Postgres connection string was given.
+    filecache: Option<FileCacheState>,
+    /// Cgroup watcher; `None` when no cgroup name was given. Shared (via `Arc`)
+    /// with the background task that runs `CgroupWatcher::watch`.
+    cgroup: Option<Arc<CgroupWatcher>>,
+    /// Owns the websocket connection and the channels to the cgroup watcher.
+    dispatcher: Dispatcher,
+
+    /// We "mint" new message ids by incrementing this counter and taking the value.
+    ///
+    /// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
+    /// by us vs the autoscaler-agent.
+    counter: usize,
+
+    /// A signal to kill the main thread produced by `self.run()`. This is triggered
+    /// when the server receives a new connection. When the thread receives the
+    /// signal off this channel, it will gracefully shutdown.
+    kill: broadcast::Receiver<()>,
+}
+
+/// Configuration for a `Runner`.
+///
+/// The `Default` impl sets `sys_buffer_bytes` to 100 MiB. `Runner::new`
+/// rejects a config whose `sys_buffer_bytes` is zero.
+#[derive(Debug)]
+pub struct Config {
+    /// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
+    /// handing out the rest to userspace. This value is the estimated difference between the
+    /// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
+    ///
+    /// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
+    /// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
+    ///
+    /// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
+    /// size, rather than the self-reported memory size, according to the kernel.
+    ///
+    /// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
+    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
+    /// should be removed once we have a better solution there.
+    sys_buffer_bytes: u64,
+}
+
+impl Default for Config {
+    /// Default configuration: a 100 MiB system buffer.
+    fn default() -> Self {
+        let sys_buffer_bytes = 100 * MiB;
+        Config { sys_buffer_bytes }
+    }
+}
+
+impl Runner {
+    /// Create a new monitor.
+    ///
+    /// Steps, in order:
+    /// 1. Validate `config` (`sys_buffer_bytes` must be non-zero).
+    /// 2. Create the dispatcher on top of the websocket.
+    /// 3. If `args.pgconnstr` is set, connect to the file cache and set its
+    ///    initial size from the total system memory.
+    /// 4. If `args.cgroup` is set, create the cgroup watcher, set its memory
+    ///    limits from what is left after the file cache, and spawn its watch
+    ///    task under `token`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the config is invalid, or if creating the dispatcher, file cache
+    /// or cgroup watcher (or applying their initial sizes/limits) fails.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the default `FileCacheConfig` is not in-memory.
+    #[tracing::instrument(skip_all, fields(?config, ?args))]
+    pub async fn new(
+        config: Config,
+        args: &Args,
+        ws: WebSocket,
+        kill: broadcast::Receiver<()>,
+        token: CancellationToken,
+    ) -> anyhow::Result<Runner> {
+        anyhow::ensure!(
+            config.sys_buffer_bytes != 0,
+            "invalid monitor Config: sys_buffer_bytes cannot be 0"
+        );
+
+        // *NOTE*: the dispatcher and cgroup manager talk through these channels
+        // so make sure they each get the correct half, nothing is dropped, etc.
+        let (notified_send, notified_recv) = mpsc::channel(1);
+        let (requesting_send, requesting_recv) = mpsc::channel(1);
+
+        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
+            .await
+            .context("error creating new dispatcher")?;
+
+        let mut state = Runner {
+            config,
+            filecache: None,
+            cgroup: None,
+            dispatcher,
+            counter: 1, // NB: must be odd, see the comment about the field for more.
+            kill,
+        };
+
+        // Stays 0 when no file cache is managed, so the cgroup gets all of `mem`.
+        let mut file_cache_reserved_bytes = 0;
+        let mem = get_total_system_memory();
+
+        // We need to process file cache initialization before cgroup initialization, so that the memory
+        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
+        // memory limits.
+        if let Some(connstr) = &args.pgconnstr {
+            info!("initializing file cache");
+            let config: FileCacheConfig = Default::default();
+            // NOTE(review): the panic message below presumably means "file cache
+            // not in-memory is unimplemented" (compare the other call sites).
+            if !config.in_memory {
+                panic!("file cache not in-memory implemented")
+            }
+
+            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+                .await
+                .context("failed to create file cache")?;
+
+            // The current size is only fetched so it can be logged next to the new one.
+            let size = file_cache
+                .get_file_cache_size()
+                .await
+                .context("error getting file cache size")?;
+
+            let new_size = file_cache.config.calculate_cache_size(mem);
+            info!(
+                initial = bytes_to_mebibytes(size),
+                new = bytes_to_mebibytes(new_size),
+                "setting initial file cache size",
+            );
+
+            // note: even if size == new_size, we want to explicitly set it, just
+            // to make sure that we have the permissions to do so
+            let actual_size = file_cache
+                .set_file_cache_size(new_size)
+                .await
+                .context("failed to set file cache size, possibly due to inadequate permissions")?;
+            if actual_size != new_size {
+                info!("file cache size actually got set to {actual_size}")
+            }
+            file_cache_reserved_bytes = actual_size;
+
+            state.filecache = Some(file_cache);
+        }
+
+        if let Some(name) = &args.cgroup {
+            let (mut cgroup, cgroup_event_stream) =
+                CgroupWatcher::new(name.clone(), requesting_send)
+                    .context("failed to create cgroup manager")?;
+
+            // NOTE(review): plain subtraction; this assumes the file cache never
+            // reserves more than the total system memory.
+            let available = mem - file_cache_reserved_bytes;
+
+            cgroup
+                .set_memory_limits(available)
+                .context("failed to set cgroup memory limits")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            // Some might call this . . . cgroup v2
+            let cgroup_clone = Arc::clone(&cgroup);
+
+            // The watcher is not expected to return; if it does, that is logged as an error.
+            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
+                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
+            });
+
+            state.cgroup = Some(cgroup);
+        } else {
+            // *NOTE*: We need to forget the sender so that its drop impl does not get run.
+            // This allows us to poll it in `Runner::run` regardless of whether we
+            // are managing a cgroup or not. If we don't forget it, all receives will
+            // immediately return an error because the sender is dropped and it will
+            // claim all select! statements, effectively turning `Runner::run` into
+            // `loop { fail to receive }`.
+            mem::forget(requesting_send);
+        }
+
+        Ok(state)
+    }
+
+    /// Attempt to downscale filecache + cgroup
+    ///
+    /// Returns `(ok, status)`: `ok` says whether the downscale was applied and
+    /// `status` is a human-readable explanation.
+    ///
+    /// The downscale is refused, with `ok == false`, when the cgroup's new
+    /// `memory.high` would be below its current usage plus the configured
+    /// buffer. Nothing is changed in that case.
+    ///
+    /// # Errors
+    ///
+    /// Fails if reading the cgroup's memory usage, resizing the file cache, or
+    /// setting the cgroup limits fails.
+    #[tracing::instrument(skip_all, fields(?target))]
+    pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
+        // Nothing to adjust
+        if self.cgroup.is_none() && self.filecache.is_none() {
+            info!("no action needed for downscale (no cgroup or file cache enabled)");
+            return Ok((
+                true,
+                "monitor is not managing cgroup or file cache".to_string(),
+            ));
+        }
+
+        let requested_mem = target.mem;
+        // Clamps at 0 when the target is smaller than the system buffer.
+        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
+        let expected_file_cache_mem_usage = self
+            .filecache
+            .as_ref()
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
+        let mut new_cgroup_mem_high = 0;
+        // First pass over the cgroup: only decide whether the downscale is
+        // acceptable. No limits are changed here.
+        if let Some(cgroup) = &self.cgroup {
+            new_cgroup_mem_high = cgroup
+                .config
+                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
+
+            let current = cgroup
+                .current_memory_usage()
+                .context("failed to fetch cgroup memory")?;
+
+            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
+                let status = format!(
+                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
+                    "calculated memory.high too low",
+                    bytes_to_mebibytes(new_cgroup_mem_high),
+                    bytes_to_mebibytes(current),
+                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
+                );
+
+                info!(status, "discontinuing downscale");
+
+                return Ok((false, status));
+            }
+        }
+
+        // The downscaling has been approved. Downscale the file cache, then the cgroup.
+        let mut status = vec![];
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented")
+            }
+
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_file_cache_mem_usage)
+                .await
+                .context("failed to set file cache size")?;
+            file_cache_mem_usage = actual_usage;
+            let message = format!(
+                "set file cache size to {} MiB",
+                bytes_to_mebibytes(actual_usage)
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+
+            // The file cache ended up with a different size than planned, so the
+            // memory.high value computed in the first pass is stale; recompute it.
+            if file_cache_mem_usage != expected_file_cache_mem_usage {
+                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            }
+
+            let limits = MemoryLimits::new(
+                // new_cgroup_mem_high was declared as 0, but it cannot still hold
+                // that placeholder here: the earlier `if let Some(cgroup)` block
+                // assigned it a real value.
+                new_cgroup_mem_high,
+                available_memory,
+            );
+            cgroup
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;
+
+            let message = format!(
+                "set cgroup memory.high to {} MiB, of new max {} MiB",
+                bytes_to_mebibytes(new_cgroup_mem_high),
+                bytes_to_mebibytes(available_memory)
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        // TODO: make this status thing less jank
+        let status = status.join("; ");
+        Ok((true, status))
+    }
+
+    /// Handle new resources
+    ///
+    /// Resizes the file cache (if managed) for the newly granted memory, then
+    /// raises the cgroup's limits (if managed) to whatever remains after the
+    /// file cache. Does nothing when neither is managed.
+    ///
+    /// # Errors
+    ///
+    /// Fails if resizing the file cache or setting the cgroup memory limits
+    /// fails.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the file cache is configured as not in-memory.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
+        if self.filecache.is_none() && self.cgroup.is_none() {
+            info!("no action needed for upscale (no cgroup or file cache enabled)");
+            return Ok(());
+        }
+
+        let new_mem = resources.mem;
+        // Clamps at 0 when the grant is smaller than the system buffer.
+        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
+
+        // Get the file cache's expected contribution to the memory usage
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented");
+            }
+
+            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
+            info!(
+                target = bytes_to_mebibytes(expected_usage),
+                total = bytes_to_mebibytes(new_mem),
+                "updating file cache size",
+            );
+
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_usage)
+                .await
+                .context("failed to set file cache size")?;
+
+            if actual_usage != expected_usage {
+                warn!(
+                    "file cache was set to a different size that we wanted: target = {} Mib, actual= {} Mib",
+                    bytes_to_mebibytes(expected_usage),
+                    bytes_to_mebibytes(actual_usage)
+                )
+            }
+            // Use the size that actually took effect for the cgroup budget below.
+            file_cache_mem_usage = actual_usage;
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            info!(
+                target = bytes_to_mebibytes(new_cgroup_mem_high),
+                total = bytes_to_mebibytes(new_mem),
+                name = cgroup.path(),
+                "updating cgroup memory.high",
+            );
+            let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
+            // This is the cgroup step, so the error context names the cgroup
+            // limits rather than the file cache.
+            cgroup
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;
+        }
+
+        Ok(())
+    }
+
+    /// Act on one inbound message and produce the reply, if any.
+    ///
+    /// * `UpscaleNotification` — apply the upscale, notify the cgroup watcher,
+    ///   and reply with `UpscaleConfirmation`.
+    /// * `DownscaleRequest` — attempt the downscale and reply with a
+    ///   `DownscaleResult` carrying its outcome.
+    /// * `HealthCheck` — reply with `HealthCheck`.
+    /// * `InvalidMessage` / `InternalError` — log a warning; no reply.
+    ///
+    /// Every reply reuses the `id` of the inbound message.
+    #[tracing::instrument(skip_all, fields(%id, message = ?inner))]
+    pub async fn process_message(
+        &mut self,
+        InboundMsg { inner, id }: InboundMsg,
+    ) -> anyhow::Result<Option<OutboundMsg>> {
+        let reply = match inner {
+            InboundMsgKind::HealthCheck {} => OutboundMsgKind::HealthCheck {},
+            InboundMsgKind::InternalError { error } => {
+                warn!(error, id, "informant experienced an internal error");
+                return Ok(None);
+            }
+            InboundMsgKind::InvalidMessage { error } => {
+                warn!(
+                    %error, id, "received notification of an invalid message we sent"
+                );
+                return Ok(None);
+            }
+            InboundMsgKind::DownscaleRequest { target } => {
+                let (ok, status) = self
+                    .try_downscale(target)
+                    .await
+                    .context("failed to downscale")?;
+                OutboundMsgKind::DownscaleResult { ok, status }
+            }
+            InboundMsgKind::UpscaleNotification { granted } => {
+                self.handle_upscale(granted)
+                    .await
+                    .context("failed to handle upscale")?;
+                self.dispatcher
+                    .notify_upscale(Sequenced::new(granted))
+                    .await
+                    .context("failed to notify notify cgroup of upscale")?;
+                OutboundMsgKind::UpscaleConfirmation {}
+            }
+        };
+
+        Ok(Some(OutboundMsg::new(reply, id)))
+    }
+
+    /// Main loop of the monitor. Each iteration waits for whichever of these
+    /// happens first:
+    ///
+    /// * a kill signal — returns `Ok(())`;
+    /// * an upscale request from the cgroup watcher — forwarded to the
+    ///   informant as an `UpscaleRequest` with a freshly minted id;
+    /// * a websocket message from the informant — deserialized, processed, and
+    ///   answered if processing yields a reply.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the kill channel fails, the upscale-request channel
+    /// is closed, a text message cannot be deserialized, a send fails, or the
+    /// websocket stream ends. An error from `process_message` does not end the
+    /// loop; it is reported to the informant as an `InternalError`.
+    // TODO: don't propagate errors, probably just warn!?
+    #[tracing::instrument(skip_all)]
+    pub async fn run(&mut self) -> anyhow::Result<()> {
+        info!("starting dispatcher");
+        loop {
+            tokio::select! {
+                signal = self.kill.recv() => {
+                    match signal {
+                        Ok(()) => return Ok(()),
+                        Err(e) => bail!("failed to receive kill signal: {e}")
+                    }
+                }
+                // we need to propagate an upscale request
+                request = self.dispatcher.request_upscale_events.recv() => {
+                    // `None` means every sender for this channel is gone.
+                    if request.is_none() {
+                        bail!("failed to listen for upscale event from cgroup")
+                    }
+                    info!("cgroup asking for upscale; forwarding request");
+                    self.counter += 2; // Increment, preserving parity (i.e. keep the
+                                       // counter odd). See the field comment for more.
+                    self.dispatcher
+                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
+                        .await
+                        .context("failed to send message")?;
+                }
+                // there is a message from the informant
+                msg = self.dispatcher.source.next() => {
+                    if let Some(msg) = msg {
+                        // Don't use 'message' as a key as the string also uses
+                        // that for its key
+                        info!(?msg, "received message");
+                        match msg {
+                            Ok(msg) => {
+                                let message: InboundMsg = match msg {
+                                    Message::Text(text) => {
+                                        serde_json::from_str(&text).context("failed to deserialize text message")?
+                                    }
+                                    other => {
+                                        warn!(
+                                            // Don't use 'message' as a key as the
+                                            // string also uses that for its key
+                                            msg = ?other,
+                                            "informant should only send text messages but received different type"
+                                        );
+                                        continue
+                                    },
+                                };
+
+                                // The message is cloned so that its id is still
+                                // available for the error reply below.
+                                let out = match self.process_message(message.clone()).await {
+                                    Ok(Some(out)) => out,
+                                    Ok(None) => continue,
+                                    Err(e) => {
+                                        let error = e.to_string();
+                                        warn!(?error, "error handling message");
+                                        OutboundMsg::new(
+                                            OutboundMsgKind::InternalError {
+                                                error
+                                            },
+                                            message.id
+                                        )
+                                    }
+                                };
+
+                                self.dispatcher
+                                    .send(out)
+                                    .await
+                                    .context("failed to send message")?;
+                            }
+                            // A stream-level error is logged and the loop keeps going.
+                            Err(e) => warn!("{e}"),
+                        }
+                    } else {
+                        anyhow::bail!("dispatcher connection closed")
+                    }
+                }
+            }
+        }
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -51,6 +51,7 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
+smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
 tokio-tar.workspace = true
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -215,7 +215,6 @@ fn bench_sequential(c: &mut Criterion) {
            TimelineId::generate(),
            zero.add(10 * i32)..zero.add(10 * i32 + 1),
            Lsn(i),
-            false,
            0,
        );
        updates.insert_historic(layer);
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -23,6 +23,7 @@
 //!      <https://grafana.com/tutorials/build-a-panel-plugin/>
 use anyhow::Result;
 use pageserver::repository::Key;
+use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -71,6 +72,10 @@ pub fn main() -> Result<()> {
        let line = PathBuf::from_str(&line).unwrap();
        let filename = line.file_name().unwrap();
        let filename = filename.to_str().unwrap();
+        if filename == METADATA_FILE_NAME {
+            // Don't try and parse "metadata" like a key-lsn range
+            continue;
+        }
        let range = parse_filename(filename);
        ranges.push(range);
    }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -10,7 +10,7 @@ use std::{fs, path::Path, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
-use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
+use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
@@ -107,23 +107,25 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
    let mut prev_key: Option<Key> = None;
-    tree_reader.visit(
-        &[0u8; DELTA_KEY_SIZE],
-        VisitDirection::Forwards,
-        |key, _value| {
-            let curr = Key::from_slice(&key[..KEY_SIZE]);
-            if let Some(prev) = prev_key {
-                if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
-                    heap.push(Hole(prev..curr));
-                    if heap.len() > max_holes {
-                        heap.pop(); // remove smallest hole
+    tree_reader
+        .visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, _value| {
+                let curr = Key::from_slice(&key[..KEY_SIZE]);
+                if let Some(prev) = prev_key {
+                    if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
+                        heap.push(Hole(prev..curr));
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
+                        }
                    }
                }
-            }
-            prev_key = Some(curr.next());
-            true
-        },
-    )?;
+                prev_key = Some(curr.next());
+                true
+            },
+        )
+        .await?;
    let mut holes = heap.into_vec();
    holes.sort_by_key(|hole| hole.0.start);
    Ok(holes)
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -44,8 +44,6 @@ pub(crate) enum LayerCmd {
 }

 async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
-    use pageserver::tenant::block_io::BlockReader;
-
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
@@ -59,18 +57,20 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
-    tree_reader.visit(
-        &[0u8; DELTA_KEY_SIZE],
-        VisitDirection::Forwards,
-        |key, value_offset| {
-            let curr = Key::from_slice(&key[..KEY_SIZE]);
-            all.push((curr, BlobRef(value_offset)));
-            true
-        },
-    )?;
-    let mut cursor = BlockCursor::new(&file);
+    tree_reader
+        .visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, value_offset| {
+                let curr = Key::from_slice(&key[..KEY_SIZE]);
+                all.push((curr, BlobRef(value_offset)));
+                true
+            },
+        )
+        .await?;
+    let cursor = BlockCursor::new_fileblockreader_virtual(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos())?;
+        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -6,11 +6,14 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};

 use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
-use fail::FailScenario;
+
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
+use pageserver::tenant::TenantSharedResources;
 use remote_storage::GenericRemoteStorage;
+use tokio::time::Instant;
 use tracing::*;

 use metrics::set_build_info_metric;
@@ -38,8 +41,6 @@ const PID_FILE_NAME: &str = "pageserver.pid";
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
-    #[cfg(feature = "fail/failpoints")]
-    "fail/failpoints",
 ];

 fn version() -> String {
@@ -121,7 +122,7 @@ fn main() -> anyhow::Result<()> {
    }

    // Initialize up failpoints support
-    let scenario = FailScenario::setup();
+    let scenario = pageserver::failpoint_support::init();

    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
@@ -226,6 +227,19 @@ fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
+    // Monotonic time for later calculating startup duration
+    let started_startup_at = Instant::now();
+
+    let startup_checkpoint = move |phase: &str, human_phase: &str| {
+        let elapsed = started_startup_at.elapsed();
+        let secs = elapsed.as_secs_f64();
+        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "{human_phase} ({secs:.3}s since start)"
+        )
+    };
+
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -335,6 +349,11 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // Up to this point no significant I/O has been done: this should have been fast.  Record
+    // duration prior to starting I/O intensive phase of startup.
+    startup_checkpoint("initial", "Starting loading tenants");
+    STARTUP_IS_LOADING.set(1);
+
    // Startup staging or optimizing:
    //
    // We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -355,18 +374,19 @@ fn start_pageserver(
    let order = pageserver::InitializationOrder {
        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: init_logical_size_done_tx,
+        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

    // Scan the local 'tenants/' directory and start loading the tenants
-    let init_started_at = std::time::Instant::now();
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
-        broker_client.clone(),
-        remote_storage.clone(),
+        TenantSharedResources {
+            broker_client: broker_client.clone(),
+            remote_storage: remote_storage.clone(),
+        },
        order,
    ))?;

@@ -378,18 +398,13 @@ fn start_pageserver(
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

            init_done_rx.wait().await;
+            startup_checkpoint("initial_tenant_load", "Initial load completed");
+            STARTUP_IS_LOADING.set(0);
+
            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let init_done = std::time::Instant::now();
-            let elapsed = init_done - init_started_at;
-
-            tracing::info!(
-                elapsed_millis = elapsed.as_millis(),
-                "Initial load completed"
-            );
-
            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());

            let timeout = conf.background_task_maximum_delay;
@@ -398,12 +413,7 @@ fn start_pageserver(

            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
                Ok(_) => {
-                    let now = std::time::Instant::now();
-                    tracing::info!(
-                        from_init_done_millis = (now - init_done).as_millis(),
-                        from_init_millis = (now - init_started_at).as_millis(),
-                        "Initial logical sizes completed"
-                    );
+                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
                    None
                }
                Err(_) => {
@@ -419,6 +429,7 @@ fn start_pageserver(

            // allow background jobs to start
            drop(background_jobs_can_start);
+            startup_checkpoint("background_jobs_can_start", "Starting background jobs");

            if let Some(init_sizes_done) = init_sizes_done {
                // ending up here is not a bug; at the latest logical sizes will be queried by
@@ -428,14 +439,11 @@ fn start_pageserver(

                scopeguard::ScopeGuard::into_inner(guard);

-                let now = std::time::Instant::now();
-                tracing::info!(
-                    from_init_done_millis = (now - init_done).as_millis(),
-                    from_init_millis = (now - init_started_at).as_millis(),
-                    "Initial logical sizes completed after timeout (background jobs already started)"
-                );
+                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");

            }
+
+            startup_checkpoint("complete", "Startup complete");
        };

        async move {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -31,7 +31,9 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
+use crate::tenant::{
+    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
+};
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
    TIMELINE_UNINIT_MARK_SUFFIX,
@@ -613,6 +615,11 @@ impl PageServerConf {
        )
    }

+    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_DELETED_MARKER_FILE_NAME)
+    }
+
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,27 +7,23 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::Utc;
+use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use serde::Serialize;
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
-use std::time::Duration;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime};
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};
-
-const WRITTEN_SIZE: &str = "written_size";
-const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
-const RESIDENT_SIZE: &str = "resident_size";
-const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
-const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
+use utils::lsn::Lsn;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Clone, Copy)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -38,10 +34,142 @@ struct Ids {

 /// Key that uniquely identifies the object, this metric describes.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct PageserverConsumptionMetricsKey {
-    pub tenant_id: TenantId,
-    pub timeline_id: Option<TimelineId>,
-    pub metric: &'static str,
+struct MetricsKey {
+    tenant_id: TenantId,
+    timeline_id: Option<TimelineId>,
+    metric: &'static str,
+}
+
+impl MetricsKey {
+    const fn absolute_values(self) -> AbsoluteValueFactory {
+        AbsoluteValueFactory(self)
+    }
+    const fn incremental_values(self) -> IncrementalValueFactory {
+        IncrementalValueFactory(self)
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only absolute values.
+struct AbsoluteValueFactory(MetricsKey);
+
+impl AbsoluteValueFactory {
+    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        (key, (EventType::Absolute { time }, val))
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only incremental values.
+struct IncrementalValueFactory(MetricsKey);
+
+impl IncrementalValueFactory {
+    #[allow(clippy::wrong_self_convention)]
+    fn from_previous_up_to(
+        self,
+        prev_end: DateTime<Utc>,
+        up_to: DateTime<Utc>,
+        val: u64,
+    ) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        // cannot assert prev_end < up_to because these are realtime clock based
+        (
+            key,
+            (
+                EventType::Incremental {
+                    start_time: prev_end,
+                    stop_time: up_to,
+                },
+                val,
+            ),
+        )
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+// the static part of a MetricsKey
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "written_size",
+        }
+        .absolute_values()
+    }
+
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent, starting from the previously sent incremental time range ending at the
+    /// latest absolute measurement.
+    const fn written_size_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> IncrementalValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            // the name here is correctly about data not size, because that is what is wanted by
+            // downstream pipeline
+            metric: "written_data_bytes_delta",
+        }
+        .incremental_values()
+    }
+
+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    const fn timeline_logical_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "timeline_logical_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "remote_storage_size",
+        }
+        .absolute_values()
+    }
+
+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "resident_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "synthetic_storage_size",
+        }
+        .absolute_values()
+    }
 }

 /// Main thread that serves metrics collection
@@ -79,7 +207,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
+    let mut cached_metrics = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -119,15 +247,15 @@ pub async fn collect_metrics(
 ///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-pub async fn collect_metrics_iteration(
+async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
+    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
+    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -161,99 +289,65 @@ pub async fn collect_metrics_iteration(
        let mut tenant_resident_size = 0;

        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines().iter() {
+        for timeline in tenant.list_timelines() {
            // collect per-timeline metrics only for active timelines
-            if timeline.is_active() {
-                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
+            let timeline_id = timeline.timeline_id;
+
+            match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(Some(snap)) => {
+                    snap.to_metrics(
                        tenant_id,
-                        timeline_id: Some(timeline.timeline_id),
-                        metric: WRITTEN_SIZE,
-                    },
-                    timeline_written_size,
-                ));
-
-                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
-                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
-                    // Only send timeline logical size when it is fully calculated.
-                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push((
-                            PageserverConsumptionMetricsKey {
-                                tenant_id,
-                                timeline_id: Some(timeline.timeline_id),
-                                metric: TIMELINE_LOGICAL_SIZE,
-                            },
-                            size,
-                        ));
-                    }
-                    Ok((_, _)) => {}
-                    Err(err) => {
-                        error!(
-                            "failed to get current logical size for timeline {}: {err:?}",
-                            timeline.timeline_id
-                        );
-                        continue;
-                    }
-                };
+                        timeline_id,
+                        Utc::now(),
+                        &mut current_metrics,
+                        cached_metrics,
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
            }

-            let timeline_resident_size = timeline.get_resident_physical_size();
-            tenant_resident_size += timeline_resident_size;
+            tenant_resident_size += timeline.resident_physical_size();
        }

-        match tenant.get_remote_size().await {
-            Ok(tenant_remote_size) => {
-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
-                        tenant_id,
-                        timeline_id: None,
-                        metric: REMOTE_STORAGE_SIZE,
-                    },
-                    tenant_remote_size,
-                ));
-            }
-            Err(err) => {
-                error!(
-                    "failed to get remote size for tenant {}: {err:?}",
-                    tenant_id
-                );
-            }
-        }
+        current_metrics
+            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));

-        current_metrics.push((
-            PageserverConsumptionMetricsKey {
-                tenant_id,
-                timeline_id: None,
-                metric: RESIDENT_SIZE,
-            },
-            tenant_resident_size,
-        ));
+        current_metrics
+            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
-        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
+        let synthetic_size = tenant.cached_synthetic_size();

-        if tenant_synthetic_size != 0 {
+        if synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push((
-                PageserverConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: None,
-                    metric: SYNTHETIC_STORAGE_SIZE,
-                },
-                tenant_synthetic_size,
-            ));
+            current_metrics
+                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
        }
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
-            Some(val) => val != curr_val,
-            None => true,
+        current_metrics.retain(|(curr_key, (kind, curr_val))| {
+            if kind.is_incremental() {
+                // incremental values (currently only written_size_delta) should not get any cache
+                // deduplication because they will be used by upstream for "is still alive."
+                true
+            } else {
+                match cached_metrics.get(curr_key) {
+                    Some((_, val)) => val != curr_val,
+                    None => true,
+                }
+            }
        });
    }

@@ -268,14 +362,16 @@ pub async fn collect_metrics_iteration(

    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

+    let node_id = node_id.to_string();
+
    for chunk in chunks {
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
-            kind: EventType::Absolute { time: Utc::now() },
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
+            kind: *when,
            metric: curr_key.metric,
-            idempotency_key: idempotency_key(node_id.to_string()),
+            idempotency_key: idempotency_key(&node_id),
            value: *curr_val,
            extra: Ids {
                tenant_id: curr_key.tenant_id,
@@ -283,17 +379,14 @@ pub async fn collect_metrics_iteration(
            },
        }));

-        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
-            events: &chunk_to_send,
-        })
-        .expect("PageserverConsumptionMetric should not fail serialization");
-
        const MAX_RETRIES: u32 = 3;

        for attempt in 0..MAX_RETRIES {
            let res = client
                .post(metric_collection_endpoint.clone())
-                .json(&chunk_json)
+                .json(&EventChunk {
+                    events: (&chunk_to_send).into(),
+                })
                .send()
                .await;

@@ -329,6 +422,130 @@ pub async fn collect_metrics_iteration(
    }
 }

+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime),
+    last_record_lsn: Lsn,
+    current_exact_logical_size: Option<u64>,
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        use anyhow::Context;
+
+        if !t.is_active() {
+            // no collection is needed for broken or stopping timelines; the caller will
+            // still keep the previously cached values.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
+        cache: &HashMap<MetricsKey, (EventType, u64)>,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
+        // features might change this.
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+
+        // use this when available, because in a stream of incremental values, it will be
+        // accurate, whereas when last_record_lsn stops moving, we will only cache the last
+        // one of those.
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+
+        // written_size_delta (emitted as "written_data_bytes_delta")
+        metrics.extend(
+            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+                let up_to = written_size_now
+                    .0
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
+                Some(key_value)
+            } else {
+                None
+            },
+        );
+
+        // written_size
+        metrics.push((key, written_size_now));
+
+        if let Some(size) = self.current_exact_logical_size {
+            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+        }
+    }
+}
+
 /// Caclculate synthetic size for each active tenant
 pub async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
@@ -343,7 +560,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-        tick_at = ticker.tick() => {
+            tick_at = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -379,3 +596,149 @@ pub async fn calculate_synthetic_size_worker(
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use std::time::SystemTime;
+    use utils::{
+        id::{TenantId, TimelineId},
+        lsn::Lsn,
+    };
+
+    use crate::consumption_metrics::MetricsKey;
+
+    use super::TimelineSnapshot;
+    use chrono::{DateTime, Utc};
+
+    #[test]
+    fn startup_collected_timeline_metrics_before_advancing() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::new();
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, SystemTime::now()),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        let now = DateTime::<Utc>::from(SystemTime::now());
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    snap.loaded_at.1.into(),
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_second_round() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id)
+                    .from_previous_up_to(before, now, 0),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, just_before, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let just_before = DateTime::<Utc>::from(just_before);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            // at t=before was the last time the last_record_lsn changed
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
+            // end time of this event is used for the next ones
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                before,
+                just_before,
+                0,
+            ),
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    just_before,
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
+        times[0] = std::time::SystemTime::now();
+        for behind in 1..N {
+            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
+        }
+
+        times
+    }
+}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -85,6 +85,7 @@
 //! The solution is that all code paths are infected with precisely one
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.
+
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
@@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind;
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
+    access_stats_behavior: AccessStatsBehavior,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -109,6 +111,67 @@ pub enum DownloadBehavior {
    Error,
 }

+/// Whether this request should update access times used in LRU eviction
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) enum AccessStatsBehavior {
+    /// Update access times: this request's access to data should be taken
+    /// as a hint that the accessed layer is likely to be accessed again
+    Update,
+
+    /// Do not update access times: this request is accessing the layer
+    /// but does not want to indicate that the layer should be retained in cache,
+    /// perhaps because the requestor is a compaction routine that will soon cover
+    /// this layer with another.
+    Skip,
+}
+
+pub struct RequestContextBuilder {
+    inner: RequestContext,
+}
+
+impl RequestContextBuilder {
+    /// A new builder with default settings
+    pub fn new(task_kind: TaskKind) -> Self {
+        Self {
+            inner: RequestContext {
+                task_kind,
+                download_behavior: DownloadBehavior::Download,
+                access_stats_behavior: AccessStatsBehavior::Update,
+            },
+        }
+    }
+
+    pub fn extend(original: &RequestContext) -> Self {
+        Self {
+            // This is like a Copy, but avoid implementing Copy because ordinary users of
+            // RequestContext should always move or ref it.
+            inner: RequestContext {
+                task_kind: original.task_kind,
+                download_behavior: original.download_behavior,
+                access_stats_behavior: original.access_stats_behavior,
+            },
+        }
+    }
+
+    /// Configure the DownloadBehavior of the context: whether to
+    /// download missing layers, and/or warn on the download.
+    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
+        self.inner.download_behavior = b;
+        self
+    }
+
+    /// Configure the AccessStatsBehavior of the context: whether layer
+    /// accesses should update the access time of the layer.
+    pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
+        self.inner.access_stats_behavior = b;
+        self
+    }
+
+    pub fn build(self) -> RequestContext {
+        self.inner
+    }
+}
+
 impl RequestContext {
    /// Create a new RequestContext that has no parent.
    ///
@@ -123,10 +186,9 @@ impl RequestContext {
    /// because someone explicitly canceled it.
    /// It has no parent, so it cannot inherit cancellation from there.
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        RequestContextBuilder::new(task_kind)
+            .download_behavior(download_behavior)
+            .build()
    }

    /// Create a detached child context for a task that may outlive `self`.
@@ -187,10 +249,7 @@ impl RequestContext {
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        Self::new(task_kind, download_behavior)
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -200,4 +259,8 @@ impl RequestContext {
    pub fn download_behavior(&self) -> DownloadBehavior {
        self.download_behavior
    }
+
+    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
+        self.access_stats_behavior
+    }
 }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,11 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{
+        self,
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        Timeline,
+    },
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -108,7 +112,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
                .await;
            Ok(())
        },
@@ -121,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: GenericRemoteStorage,
+    _storage: &GenericRemoteStorage,
    tenants_dir: &Path,
    cancel: CancellationToken,
 ) {
@@ -145,14 +149,8 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                &storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;

            match res {
                Ok(()) => {}
@@ -183,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
    tenants_dir: &Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -273,7 +270,6 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -304,17 +300,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        let desc = candidate.layer.layer_desc();
        debug!(
            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
            i + 1,
            candidates.len(),
-            candidate.layer.file_size(),
+            desc.file_size,
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
            partition,
-            candidate.layer.get_tenant_id(),
-            candidate.layer.get_timeline_id(),
+            desc.tenant_id,
+            desc.timeline_id,
            candidate.layer,
        );
    }
@@ -329,9 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
+    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -346,12 +344,17 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            warned = Some(usage_planned);
        }

-        usage_planned.add_available_bytes(candidate.layer.file_size());
+        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        batched
-            .entry(TimelineKey(candidate.timeline))
-            .or_default()
-            .push(candidate.layer);
+        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
+
+        // semaphore will later be used to limit eviction concurrency, and we can express at
+        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
+        // but fail gracefully by not making batches larger.
+        if batch.len() < u32::MAX as usize {
+            batch.push(candidate.layer);
+            max_batch_size = max_batch_size.max(batch.len());
+        }
    }

    let usage_planned = match warned {
@@ -368,63 +371,101 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

-    // After the loop, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
+    let mut js = tokio::task::JoinSet::new();
+
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size = batch.len();
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+
+        // I dislike naming of `available_permits` but it means current total amount of permits
+        // because permits can be added
+        assert!(batch_size as usize <= limit.available_permits());

        debug!(%timeline_id, "evicting batch for timeline");

-        async {
-            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();

-            match results {
-                Err(e) => {
-                    warn!("failed to evict batch: {:#}", e);
-                }
-                Ok(results) => {
-                    assert_eq!(results.len(), batch.len());
-                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(layer.file_size());
-                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += layer.file_size();
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += layer.file_size();
-                                evictions_failed.count += 1;
-                            }
-                            None => {
-                                assert!(cancel.is_cancelled());
-                                return;
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch, &cancel).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
                            }
                        }
                    }
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
+                    }
                }
+                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
-        .await;
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        if cancel.is_cancelled() {
+        js.spawn(evict);
+
+        // spawning multiple thousands of these is essentially blocking, so give already spawned a
+        // chance of making progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
+        (usage_assumed, evictions_failed)
+    };
+
+    let (usage_assumed, evictions_failed) = tokio::select! {
+        tuple = join_all => { tuple },
+        _ = cancel.cancelled() => {
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
-    }
+    };

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -439,7 +480,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 #[derive(Clone)]
 struct EvictionCandidate {
    timeline: Arc<Timeline>,
-    layer: Arc<dyn PersistentLayer>,
+    layer: Layer,
    last_activity_ts: SystemTime,
 }

--- a/pageserver/src/failpoint_support.rs
+++ b/pageserver/src/failpoint_support.rs
@@ -0,0 +1,86 @@
+/// use with fail::cfg("$name", "return(2000)")
+///
+/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+/// specified time (in milliseconds). The main difference is that we use async
+/// tokio sleep function. Another difference is that we print lines to the log,
+/// which can be useful in tests to check that the failpoint was hit.
+#[macro_export]
+macro_rules! __failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        // If the failpoint is used with a "return" action, set should_sleep to the
+        // returned value (as string). Otherwise it's set to None.
+        let should_sleep = (|| {
+            ::fail::fail_point!($name, |x| x);
+            ::std::option::Option::None
+        })();
+
+        // Sleep if the action was a returned value
+        if let ::std::option::Option::Some(duration_str) = should_sleep {
+            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
+        }
+    }};
+}
+pub use __failpoint_sleep_millis_async as sleep_millis_async;
+
+// Helper function used by the macro. (A function has nicer scoping so we
+// don't need to decorate everything with "::")
+#[doc(hidden)]
+pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+    let millis = duration_str.parse::<u64>().unwrap();
+    let d = std::time::Duration::from_millis(millis);
+
+    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+    tokio::time::sleep(d).await;
+    tracing::info!("failpoint {:?}: sleep done", name);
+}
+
+pub fn init() -> fail::FailScenario<'static> {
+    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
+    // We want non-default behavior for `exit`, though, so, we handle it separately.
+    //
+    // Format for FAILPOINTS is "name=actions" separated by ";".
+    let actions = std::env::var("FAILPOINTS");
+    if actions.is_ok() {
+        std::env::remove_var("FAILPOINTS");
+    } else {
+        // let the library handle non-utf8, or nothing for not present
+    }
+
+    let scenario = fail::FailScenario::setup();
+
+    if let Ok(val) = actions {
+        val.split(';')
+            .enumerate()
+            .map(|(i, s)| s.split_once('=').ok_or((i, s)))
+            .for_each(|res| {
+                let (name, actions) = match res {
+                    Ok(t) => t,
+                    Err((i, s)) => {
+                        panic!(
+                            "startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
+                            i + 1,
+                        );
+                    }
+                };
+                if let Err(e) = apply_failpoint(name, actions) {
+                    panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
+                }
+            });
+    }
+
+    scenario
+}
+
+pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+    if actions == "exit" {
+        fail::cfg_callback(name, exit_failpoint)
+    } else {
+        fail::cfg(name, actions)
+    }
+}
+
+#[inline(never)]
+fn exit_failpoint() {
+    tracing::info!("Exit requested by failpoint");
+    std::process::exit(1);
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,6 +93,47 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+    delete:
+      description: |
+        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
+        404 means that deletion successfully finished.
+      responses:
+        "400":
+          description: Error when no tenant id found in path
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Tenant not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "409":
+          description: Deletion is already in progress, continue polling
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -820,6 +861,7 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+
  /v1/tenant/config:
    put:
      description: |
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                format!("Cannot delete timeline which has child timelines: {children:?}")
                    .into_boxed_str(),
            ),
-            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
+            a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -208,6 +208,19 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    }
 }

+impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
+    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
+        use crate::tenant::delete::DeleteTenantError::*;
+        match value {
+            Get(g) => ApiError::from(g),
+            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
+            Timeline(t) => ApiError::from(t),
+            Other(o) => ApiError::InternalServerError(o),
+            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -504,7 +517,6 @@ async fn timeline_delete_handler(
        .instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
        .await?;

-    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
    json_response(StatusCode::ACCEPTED, ())
 }

@@ -617,6 +629,23 @@ async fn tenant_status(
    json_response(StatusCode::OK, tenant_info)
 }

+async fn tenant_delete_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // TODO openapi spec
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
+        .instrument(info_span!("tenant_delete_handler", %tenant_id))
+        .await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 /// HTTP endpoint to query the current tenant_size of a tenant.
 ///
 /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
@@ -950,14 +979,7 @@ async fn failpoints_handler(

        // We recognize one extra "action" that's not natively recognized
        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = if fp.actions == "exit" {
-            fail::cfg_callback(fp.name, || {
-                info!("Exit requested by failpoint");
-                std::process::exit(1);
-            })
-        } else {
-            fail::cfg(fp.name, &fp.actions)
-        };
+        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);

        if let Err(err_msg) = cfg_result {
            return Err(ApiError::BadRequest(anyhow!(
@@ -1006,7 +1028,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1031,7 +1053,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, ())
    }
@@ -1138,11 +1160,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.clone() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

    let state = state.disk_usage_eviction_state.clone();

@@ -1160,7 +1182,6 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
-                &storage,
                usage,
                &child_cancel,
            )
@@ -1345,6 +1366,9 @@ pub fn make_router(
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
+        .delete("/v1/tenant/:tenant_id", |r| {
+            api_handler(r, tenant_delete_handler)
+        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
-pub(crate) mod metrics;
+pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
@@ -21,6 +21,8 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+pub mod failpoint_support;
+
 use std::path::Path;

 use crate::task_mgr::TaskKind;
@@ -47,50 +49,54 @@ pub use crate::metrics::preinitialize_metrics;

 #[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
+    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
+        "shutdown LibpqEndpointListener",
+        Duration::from_secs(1),
+    )
+    .await;

    // Shut down any page service tasks.
-    task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        "shutdown PageRequestHandlers",
+        Duration::from_secs(1),
+    )
+    .await;

    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
-    tenant::mgr::shutdown_all_tenants().await;
+    timed(
+        tenant::mgr::shutdown_all_tenants(),
+        "shutdown all tenants",
+        Duration::from_secs(5),
+    )
+    .await;

    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
-    task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
+        "shutdown http",
+        Duration::from_secs(1),
+    )
+    .await;

    // There should be nothing left, but let's be sure
-    task_mgr::shutdown_tasks(None, None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(None, None, None),
+        "shutdown leftovers",
+        Duration::from_secs(1),
+    )
+    .await;
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }

-const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
-const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
-
-async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
-    let backoff_duration_seconds =
-        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
-    if backoff_duration_seconds > 0.0 {
-        info!(
-            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
-        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
-    }
-}
-
-pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
-    if n == 0 {
-        0.0
-    } else {
-        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
-    }
-}
-
 /// The name of the metadata file pageserver creates per timeline.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
 pub const METADATA_FILE_NAME: &str = "metadata";
@@ -164,7 +170,7 @@ pub struct InitializationOrder {

    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: utils::completion::Completion,
+    pub initial_logical_size_attempt: Option<utils::completion::Completion>,

    /// Barrier for when we can start any background jobs.
    ///
@@ -172,33 +178,75 @@ pub struct InitializationOrder {
    pub background_jobs_can_start: utils::completion::Barrier,
 }

-#[cfg(test)]
-mod backoff_defaults_tests {
-    use super::*;
+/// Time the future with a warning when it exceeds a threshold.
+async fn timed<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_at: std::time::Duration,
+) -> <Fut as std::future::Future>::Output {
+    let started = std::time::Instant::now();

-    #[test]
-    fn backoff_defaults_produce_growing_backoff_sequence() {
-        let mut current_backoff_value = None;
+    let mut fut = std::pin::pin!(fut);

-        for i in 0..10_000 {
-            let new_backoff_value = exponential_backoff_duration_seconds(
-                i,
-                DEFAULT_BASE_BACKOFF_SECONDS,
-                DEFAULT_MAX_BACKOFF_SECONDS,
+    match tokio::time::timeout(warn_at, &mut fut).await {
+        Ok(ret) => {
+            tracing::info!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "completed"
+            );
+            ret
+        }
+        Err(_) => {
+            tracing::info!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "still waiting, taking longer than expected..."
            );

-            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
-                assert!(
-                    old_backoff_value <= new_backoff_value,
-                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
-                )
-            }
-        }
+            let ret = fut.await;

-        assert_eq!(
-            current_backoff_value.expect("Should have produced backoff values to compare"),
-            DEFAULT_MAX_BACKOFF_SECONDS,
-            "Given big enough of retries, backoff should reach its allowed max value"
-        );
+            // this has a global allowed_errors
+            tracing::warn!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "completed, took longer than expected"
+            );
+
+            ret
+        }
+    }
+}
+
+#[cfg(test)]
+mod timed_tests {
+    use super::timed;
+    use std::time::Duration;
+
+    #[tokio::test]
+    async fn timed_completes_when_inner_future_completes() {
+        // A future that completes before `warn_at` should have its result returned
+        let r1 = timed(
+            async move {
+                tokio::time::sleep(Duration::from_millis(10)).await;
+                123
+            },
+            "test 1",
+            Duration::from_millis(50),
+        )
+        .await;
+        assert_eq!(r1, 123);
+
+        // A future that outlives `warn_at` should also have its result returned
+        let r2 = timed(
+            async move {
+                tokio::time::sleep(Duration::from_millis(50)).await;
+                456
+            },
+            "test 2",
+            Duration::from_millis(10),
+        )
+        .await;
+        assert_eq!(r2, 456);
    }
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,12 +1,12 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
-    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
+    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
+    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
+    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
-use strum::VariantNames;
+use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};

@@ -394,6 +394,35 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
    .expect("failed to define a metric")
 });

+/// How long did we take to start up?  Broken down by labels to describe
+/// different phases of startup.
+pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
+    register_gauge_vec!(
+        "pageserver_startup_duration_seconds",
+        "Time taken by phases of pageserver startup, in seconds",
+        &["phase"]
+    )
+    .expect("Failed to register pageserver_startup_duration_seconds metric")
+});
+
+pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_startup_is_loading",
+        "1 while in initial startup load of tenants, 0 at other times"
+    )
+    .expect("Failed to register pageserver_startup_is_loading")
+});
+
+/// How long did tenants take to go from construction to active state?
+pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_tenant_activation_seconds",
+        "Time taken by tenants to activate, in seconds",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register pageserver_tenant_activation_seconds metric")
+});
+
 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
@@ -541,23 +570,160 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
-    "get_rel_exists",
-    "get_rel_size",
-    "get_page_at_lsn",
-    "get_db_size",
-];
+#[derive(Debug)]
+struct GlobalAndPerTimelineHistogram {
+    global: Histogram,
+    per_tenant_timeline: Histogram,
+}

-pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+impl GlobalAndPerTimelineHistogram {
+    fn observe(&self, value: f64) {
+        self.global.observe(value);
+        self.per_tenant_timeline.observe(value);
+    }
+}
+
+struct GlobalAndPerTimelineHistogramTimer<'a> {
+    h: &'a GlobalAndPerTimelineHistogram,
+    start: std::time::Instant,
+}
+
+impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
+    fn drop(&mut self) {
+        let elapsed = self.start.elapsed();
+        self.h.observe(elapsed.as_secs_f64());
+    }
+}
+
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    IntoStaticStr,
+    strum_macros::EnumCount,
+    strum_macros::EnumIter,
+    strum_macros::FromRepr,
+)]
+#[strum(serialize_all = "snake_case")]
+pub enum SmgrQueryType {
+    GetRelExists,
+    GetRelSize,
+    GetPageAtLsn,
+    GetDbSize,
+}
+
+#[derive(Debug)]
+pub struct SmgrQueryTimePerTimeline {
+    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
+}
+
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
-        "Time spent on smgr query handling",
+        "Time spent on smgr query handling, aggregated by query type and tenant/timeline.",
        &["smgr_query_type", "tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

+static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_smgr_query_seconds_global",
+        "Time spent on smgr query handling, aggregated by query type.",
+        &["smgr_query_type"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+impl SmgrQueryTimePerTimeline {
+    pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        let tenant_id = tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        let metrics = std::array::from_fn(|i| {
+            let op = SmgrQueryType::from_repr(i).unwrap();
+            let global = SMGR_QUERY_TIME_GLOBAL
+                .get_metric_with_label_values(&[op.into()])
+                .unwrap();
+            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+                .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
+                .unwrap();
+            GlobalAndPerTimelineHistogram {
+                global,
+                per_tenant_timeline,
+            }
+        });
+        Self { metrics }
+    }
+    pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
+        let metric = &self.metrics[op as usize];
+        GlobalAndPerTimelineHistogramTimer {
+            h: metric,
+            start: std::time::Instant::now(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod smgr_query_time_tests {
+    use strum::IntoEnumIterator;
+    use utils::id::{TenantId, TimelineId};
+
+    // Regression test, we used hard-coded string constants before using an enum.
+    #[test]
+    fn op_label_name() {
+        use super::SmgrQueryType::*;
+        let expect: [(super::SmgrQueryType, &'static str); 4] = [
+            (GetRelExists, "get_rel_exists"),
+            (GetRelSize, "get_rel_size"),
+            (GetPageAtLsn, "get_page_at_lsn"),
+            (GetDbSize, "get_db_size"),
+        ];
+        for (op, expect) in expect {
+            let actual: &'static str = op.into();
+            assert_eq!(actual, expect);
+        }
+    }
+
+    #[test]
+    fn basic() {
+        let ops: Vec<_> = super::SmgrQueryType::iter().collect();
+
+        for op in &ops {
+            let tenant_id = TenantId::generate();
+            let timeline_id = TimelineId::generate();
+            let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
+
+            let get_counts = || {
+                let global: u64 = ops
+                    .iter()
+                    .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
+                    .sum();
+                let per_tenant_timeline: u64 = ops
+                    .iter()
+                    .map(|op| {
+                        metrics.metrics[*op as usize]
+                            .per_tenant_timeline
+                            .get_sample_count()
+                    })
+                    .sum();
+                (global, per_tenant_timeline)
+            };
+
+            let (pre_global, pre_per_tenant_timeline) = get_counts();
+            assert_eq!(pre_per_tenant_timeline, 0);
+
+            let timer = metrics.start_timer(*op);
+            drop(timer);
+
+            let (post_global, post_per_tenant_timeline) = get_counts();
+            assert_eq!(post_per_tenant_timeline, 1);
+            assert!(post_global > pre_global);
+        }
+    }
+}
+
 // keep in sync with control plane Go code so that we can validate
 // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
 static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
@@ -1016,6 +1182,12 @@ impl Drop for TimelineMetrics {
            .write()
            .unwrap()
            .remove(tenant_id, timeline_id);
+
+        // The following metrics are born outside of the TimelineMetrics lifecycle but still
+        // removed at the end of it. The idea is to have the metrics outlive the
+        // entity during which they're observed, e.g., the smgr metrics shall
+        // outlive an individual smgr connection, but not the timeline.
+
        for op in StorageTimeOperation::VARIANTS {
            let _ =
                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1027,8 +1199,12 @@ impl Drop for TimelineMetrics {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
        }

-        for op in SMGR_QUERY_TIME_OPERATIONS {
-            let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        for op in SmgrQueryType::iter() {
+            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
+                op.into(),
+                tenant_id,
+                timeline_id,
+            ]);
        }
    }
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -10,6 +10,42 @@
 //! PostgreSQL buffer size, and a Slot struct for each buffer to contain
 //! information about what's stored in the buffer.
 //!
+//! # Types Of Pages
+//!
+//! [`PageCache`] only supports immutable pages.
+//! Hence there is no need to worry about coherency.
+//!
+//! Two types of pages are supported:
+//!
+//! * **Materialized pages**, filled & used by page reconstruction
+//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
+//!
+//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but it is append-only.
+//! It uses the page cache only for the blocks that are already fully written and immutable.
+//!
+//! # Filling The Page Cache
+//!
+//! Page cache maps from a cache key to a buffer slot.
+//! The cache key uniquely identifies the piece of data that is being cached.
+//!
+//! The cache key for **materialized pages** is the combination of [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
+//!
+//! The cache key for **immutable file** pages is [`FileId`] and a block number.
+//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
+//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
+//! * Get a [`FileId`] using [`next_file_id`].
+//! * Use the mechanism to associate the on-disk file with the returned [`FileId`].
+//! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`].
+//! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains
+//!   a read guard for the page. Just use it.
+//! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains
+//!   a write guard for the page. Fill the page with the contents of the on-disk file.
+//!   Then call [`PageWriteGuard::mark_valid`] to mark the page as valid.
+//!   Then try again to [`PageCache::read_immutable_buf`].
+//!   Unless there's high cache pressure, the page should now be cached.
+//!   (TODO: allow downgrading the write guard to a read guard to ensure forward progress.)
+//!
 //! # Locking
 //!
 //! There are two levels of locking involved: There's one lock for the "mapping"
@@ -40,20 +76,18 @@ use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
    sync::{
-        atomic::{AtomicU8, AtomicUsize, Ordering},
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
        RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
    },
 };

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use tracing::error;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

-use crate::tenant::writeback_ephemeral_file;
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -87,6 +121,17 @@ pub fn get() -> &'static PageCache {
 pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
 const MAX_USAGE_COUNT: u8 = 5;

+/// See module-level comment.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub struct FileId(u64);
+
+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
+/// See module-level comment.
+pub fn next_file_id() -> FileId {
+    FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed))
+}
+
 ///
 /// CacheKey uniquely identifies a "thing" to cache in the page cache.
 ///
@@ -97,12 +142,8 @@ enum CacheKey {
        hash_key: MaterializedPageHashKey,
        lsn: Lsn,
    },
-    EphemeralPage {
-        file_id: u64,
-        blkno: u32,
-    },
    ImmutableFilePage {
-        file_id: u64,
+        file_id: FileId,
        blkno: u32,
    },
 }
@@ -128,7 +169,6 @@ struct Slot {
 struct SlotInner {
    key: Option<CacheKey>,
    buf: &'static mut [u8; PAGE_SZ],
-    dirty: bool,
 }

 impl Slot {
@@ -177,9 +217,7 @@ pub struct PageCache {
    /// can have a separate mapping map, next to this field.
    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
-
-    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
+    immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -258,14 +296,6 @@ impl PageWriteGuard<'_> {
        );
        self.valid = true;
    }
-    pub fn mark_dirty(&mut self) {
-        // only ephemeral pages can be dirty ATM.
-        assert!(matches!(
-            self.inner.key,
-            Some(CacheKey::EphemeralPage { .. })
-        ));
-        self.inner.dirty = true;
-    }
 }

 impl Drop for PageWriteGuard<'_> {
@@ -280,7 +310,6 @@ impl Drop for PageWriteGuard<'_> {
            let self_key = self.inner.key.as_ref().unwrap();
            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
            self.inner.key = None;
-            self.inner.dirty = false;
        }
    }
 }
@@ -388,50 +417,16 @@ impl PageCache {
        Ok(())
    }

-    // Section 1.2: Public interface functions for working with Ephemeral pages.
+    // Section 1.2: Public interface functions for working with immutable file pages.

-    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
-        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_read(&mut cache_key)
-    }
-
-    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
-        let cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_write(&cache_key)
-    }
-
-    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
-        for slot_idx in 0..self.slots.len() {
-            let slot = &self.slots[slot_idx];
-
-            let mut inner = slot.inner.write().unwrap();
-            if let Some(key) = &inner.key {
-                match key {
-                    CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
-                        // remove mapping for old buffer
-                        self.remove_mapping(key);
-                        inner.key = None;
-                        inner.dirty = false;
-                    }
-                    _ => {}
-                }
-            }
-        }
-    }
-
-    // Section 1.3: Public interface functions for working with immutable file pages.
-
-    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
+    pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

-    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
+    /// Immediately drop all buffers belonging to given file
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

@@ -444,7 +439,6 @@ impl PageCache {
                        // remove mapping for old buffer
                        self.remove_mapping(key);
                        inner.key = None;
-                        inner.dirty = false;
                    }
                    _ => {}
                }
@@ -522,10 +516,6 @@ impl PageCache {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
-            CacheKey::EphemeralPage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
-                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
-            ),
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
                &crate::metrics::PAGE_CACHE.read_hits_immutable,
@@ -566,7 +556,6 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
@@ -628,7 +617,6 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
@@ -667,10 +655,6 @@ impl PageCache {
                *lsn = version.lsn;
                Some(version.slot_idx)
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -694,10 +678,6 @@ impl PageCache {
                    None
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -731,12 +711,6 @@ impl PageCache {
                    panic!("could not find old key in mapping")
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -776,17 +750,7 @@ impl PageCache {
                    }
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
-                        None
-                    }
-                }
-            }
+
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -837,25 +801,8 @@ impl PageCache {
                    }
                };
                if let Some(old_key) = &inner.key {
-                    if inner.dirty {
-                        if let Err(err) = Self::writeback(old_key, inner.buf) {
-                            // Writing the page to disk failed.
-                            //
-                            // FIXME: What to do here, when? We could propagate the error to the
-                            // caller, but victim buffer is generally unrelated to the original
-                            // call. It can even belong to a different tenant. Currently, we
-                            // report the error to the log and continue the clock sweep to find
-                            // a different victim. But if the problem persists, the page cache
-                            // could fill up with dirty pages that we cannot evict, and we will
-                            // loop retrying the writebacks indefinitely.
-                            error!("writeback of buffer {:?} failed: {}", old_key, err);
-                            continue;
-                        }
-                    }
-
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
-                    inner.dirty = false;
                    inner.key = None;
                }
                return Ok((slot_idx, inner));
@@ -863,28 +810,6 @@ impl PageCache {
        }
    }

-    fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
-        match cache_key {
-            CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: _,
-            } => Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "unexpected dirty materialized page",
-            )),
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                writeback_ephemeral_file(*file_id, *blkno, buf)
-            }
-            CacheKey::ImmutableFilePage {
-                file_id: _,
-                blkno: _,
-            } => Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "unexpected dirty immutable page",
-            )),
-        }
-    }
-
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
@@ -895,7 +820,6 @@ impl PageCache {

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
-        size_metrics.current_bytes_ephemeral.set_page_sz(0);
        size_metrics.current_bytes_immutable.set_page_sz(0);
        size_metrics.current_bytes_materialized_page.set_page_sz(0);

@@ -905,11 +829,7 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        dirty: false,
-                    }),
+                    inner: RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -917,7 +837,6 @@ impl PageCache {

        Self {
            materialized_page_map: Default::default(),
-            ephemeral_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,7 +50,8 @@ use crate::basebackup;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
-use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
+use crate::metrics;
+use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant;
@@ -306,39 +307,6 @@ async fn page_service_conn_main(
    }
 }

-struct PageRequestMetrics {
-    get_rel_exists: metrics::Histogram,
-    get_rel_size: metrics::Histogram,
-    get_page_at_lsn: metrics::Histogram,
-    get_db_size: metrics::Histogram,
-}
-
-impl PageRequestMetrics {
-    fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
-        let tenant_id = tenant_id.to_string();
-        let timeline_id = timeline_id.to_string();
-
-        let get_rel_exists =
-            SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]);
-
-        let get_rel_size =
-            SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]);
-
-        let get_page_at_lsn =
-            SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]);
-
-        let get_db_size =
-            SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]);
-
-        Self {
-            get_rel_exists,
-            get_rel_size,
-            get_page_at_lsn,
-            get_db_size,
-        }
-    }
-}
-
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
@@ -406,7 +374,7 @@ impl PageServerHandler {
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        pgb.flush().await?;

-        let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
+        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

        loop {
            let msg = tokio::select! {
@@ -446,21 +414,21 @@ impl PageServerHandler {

            let response = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    let _timer = metrics.get_rel_exists.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
                    self.handle_get_rel_exists_request(&timeline, &req, &ctx)
                        .await
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    let _timer = metrics.get_rel_size.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
                    self.handle_get_nblocks_request(&timeline, &req, &ctx).await
                }
                PagestreamFeMessage::GetPage(req) => {
-                    let _timer = metrics.get_page_at_lsn.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
                    self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
                        .await
                }
                PagestreamFeMessage::DbSize(req) => {
-                    let _timer = metrics.get_db_size.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
                    self.handle_db_size_request(&timeline, &req, &ctx).await
                }
            };
@@ -984,8 +952,8 @@ where
                false
            };

-            metrics::metric_vec_duration::observe_async_block_duration_by_result(
-                &*crate::metrics::BASEBACKUP_QUERY_TIME,
+            ::metrics::metric_vec_duration::observe_async_block_duration_by_result(
+                &*metrics::BASEBACKUP_QUERY_TIME,
                async move {
                    self.handle_basebackup_request(
                        pgb,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -28,6 +28,8 @@ use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
+use std::fmt::Debug;
+use std::fmt::Display;
 use std::fs;
 use std::fs::File;
 use std::fs::OpenOptions;
@@ -46,16 +48,20 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
+use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
+use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
+use self::timeline::TimelineResources;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir;
 use crate::is_uninit_mark;
+use crate::metrics::TENANT_ACTIVATION;
 use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -66,7 +72,6 @@ use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
-use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;

 use crate::tenant::timeline::delete::DeleteTimelineFlow;
@@ -105,6 +110,7 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
+
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
@@ -117,6 +123,7 @@ mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
+pub mod delete;
 pub mod mgr;
 pub mod tasks;
 pub mod upload_queue;
@@ -126,12 +133,7 @@ pub(crate) mod timeline;
 pub mod size;

 pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-pub use timeline::{
-    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
-};
-
-// re-export this function so that page_cache.rs can use it.
-pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
+pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;
@@ -144,6 +146,16 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

 pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";

+pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
+
+/// References to shared objects that are passed into each tenant: the
+/// storage broker client and the optional shared remote storage client.
+#[derive(Clone)]
+pub struct TenantSharedResources {
+    pub broker_client: storage_broker::BrokerClientChannel,
+    pub remote_storage: Option<GenericRemoteStorage>,
+}
+
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -182,6 +194,8 @@ pub struct Tenant {
    cached_synthetic_tenant_size: Arc<AtomicU64>,

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
+
+    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 }

 // We should not blindly overwrite local metadata with remote one.
@@ -273,7 +287,7 @@ pub enum LoadLocalTimelineError {
    ResumeDeletion(#[source] anyhow::Error),
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("NotFound")]
    NotFound,
@@ -282,17 +296,37 @@ pub enum DeleteTimelineError {
    HasChildren(Vec<TimelineId>),

    #[error("Timeline deletion is already in progress")]
-    AlreadyInProgress,
+    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),

    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

+impl Debug for DeleteTimelineError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::NotFound => write!(f, "NotFound"),
+            Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
+            Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(), // payload is not printed, only the variant name
+            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
+        }
+    }
+}
+
 pub enum SetStoppingError {
    AlreadyStopping(completion::Barrier),
    Broken,
 }

+impl Debug for SetStoppingError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(), // barrier payload is not printed, only the variant name
+            Self::Broken => write!(f, "Broken"),
+        }
+    }
+}
+
 struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
@@ -361,7 +395,7 @@ impl Tenant {
    async fn timeline_init_and_sync(
        &self,
        timeline_id: TimelineId,
-        remote_client: Option<RemoteTimelineClient>,
+        resources: TimelineResources,
        remote_startup_data: Option<RemoteStartupData>,
        local_metadata: Option<TimelineMetadata>,
        ancestor: Option<Arc<Timeline>>,
@@ -382,17 +416,57 @@ impl Tenant {
            timeline_id,
            up_to_date_metadata,
            ancestor.clone(),
-            remote_client,
+            resources,
            init_order,
            CreateTimelineCause::Load,
        )?;
-        let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
+        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
+            disk_consistent_lsn.is_valid(),
            "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
        );
+        assert_eq!(
+            disk_consistent_lsn,
+            up_to_date_metadata.disk_consistent_lsn(),
+            "these are used interchangeably"
+        );
+
+        // Save the metadata file to local disk.
+        if !picked_local {
+            save_metadata(
+                self.conf,
+                &tenant_id,
+                &timeline_id,
+                up_to_date_metadata,
+                first_save,
+            )
+            .context("save_metadata")?;
+        }
+
+        let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
+
+        if let Some(index_part) = index_part {
+            timeline
+                .remote_client
+                .as_ref()
+                .unwrap()
+                .init_upload_queue(index_part)?;
+        } else if self.remote_storage.is_some() {
+            // No data on the remote storage, but we have local metadata file. We can end up
+            // here with timeline_create being interrupted before finishing index part upload.
+            // By doing what we do here, the index part upload is retried.
+            // If control plane retries timeline creation in the meantime, the mgmt API handler
+            // for timeline creation will coalesce on the upload we queue here.
+            let rtc = timeline.remote_client.as_ref().unwrap();
+            rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
+            rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
+        }
+
        timeline
-            .load_layer_map(new_disk_consistent_lsn)
+            .load_layer_map(
+                disk_consistent_lsn,
+                remote_startup_data.map(|x| x.index_part),
+            )
            .await
            .with_context(|| {
                format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
@@ -416,19 +490,6 @@ impl Tenant {
            }
        };

-        if self.remote_storage.is_some() {
-            // Reconcile local state with remote storage, downloading anything that's
-            // missing locally, and scheduling uploads for anything that's missing
-            // in remote storage.
-            timeline
-                .reconcile_with_remote(
-                    up_to_date_metadata,
-                    remote_startup_data.as_ref().map(|r| &r.index_part),
-                )
-                .await
-                .context("failed to reconcile with remote")?
-        }
-
        // Sanity check: a timeline should have some content.
        anyhow::ensure!(
            ancestor.is_some()
@@ -443,18 +504,6 @@ impl Tenant {
            "Timeline has no ancestor and no layer files"
        );

-        // Save the metadata file to local disk.
-        if !picked_local {
-            save_metadata(
-                self.conf,
-                &tenant_id,
-                &timeline_id,
-                up_to_date_metadata,
-                first_save,
-            )
-            .context("save_metadata")?;
-        }
-
        Ok(())
    }

@@ -472,6 +521,7 @@ impl Tenant {
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        broker_client: storage_broker::BrokerClientChannel,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        remote_storage: GenericRemoteStorage,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -486,7 +536,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            Some(remote_storage),
+            Some(remote_storage.clone()),
        ));

        // Do all the hard work in the background
@@ -501,17 +551,61 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
+                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used while the tenant is still in the Attaching state.
+                let make_broken = |t: &Tenant, err: anyhow::Error| {
+                    error!("attach failed, setting tenant state to Broken: {err:?}");
+                    t.state.send_modify(|state| {
+                        assert_eq!(
+                            *state,
+                            TenantState::Attaching,
+                            "the attach task owns the tenant state until activation is complete"
+                        );
+                        *state = TenantState::broken_from_reason(err.to_string());
+                    });
+                };
+
+                let pending_deletion = {
+                    match DeleteTenantFlow::should_resume_deletion(
+                        conf,
+                        Some(&remote_storage),
+                        &tenant_clone,
+                    )
+                    .await
+                    {
+                        Ok(should_resume_deletion) => should_resume_deletion,
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                    }
+                };
+
+                info!("pending_deletion {}", pending_deletion.is_some());
+
+                if let Some(deletion) = pending_deletion {
+                    match DeleteTenantFlow::resume_from_attach(
+                        deletion,
+                        &tenant_clone,
+                        tenants,
+                        &ctx,
+                    )
+                    .await
+                    {
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                        Ok(()) => return Ok(()),
+                    }
+                }
+
                match tenant_clone.attach(&ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
                    Err(e) => {
-                        error!("attach failed, setting tenant state to Broken: {:?}", e);
-                        tenant_clone.state.send_modify(|state| {
-                            assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete");
-                            *state = TenantState::broken_from_reason(e.to_string());
-                        });
+                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                    }
                }
                Ok(())
@@ -589,6 +683,9 @@ impl Tenant {
                .instrument(info_span!("download_index_part", %timeline_id)),
            );
        }
+
+        let mut timelines_to_resume_deletions = vec![];
+
        // Wait for all the download tasks to complete & collect results.
        let mut remote_index_and_client = HashMap::new();
        let mut timeline_ancestors = HashMap::new();
@@ -599,15 +696,15 @@ impl Tenant {
            debug!("successfully downloaded index part for timeline {timeline_id}");
            match index_part {
                MaybeDeletedIndexPart::IndexPart(index_part) => {
-                    timeline_ancestors.insert(
-                        timeline_id,
-                        index_part.parse_metadata().context("parse_metadata")?,
-                    );
+                    timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
                    remote_index_and_client.insert(timeline_id, (index_part, client));
                }
-                MaybeDeletedIndexPart::Deleted(_) => {
-                    info!("timeline {} is deleted, skipping", timeline_id);
-                    continue;
+                MaybeDeletedIndexPart::Deleted(index_part) => {
+                    info!(
+                        "timeline {} is deleted, picking to resume deletion",
+                        timeline_id
+                    );
+                    timelines_to_resume_deletions.push((timeline_id, index_part, client));
                }
            }
        }
@@ -615,21 +712,48 @@ impl Tenant {
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
-        let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
+        let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
        for (timeline_id, remote_metadata) in sorted_timelines {
            let (index_part, remote_client) = remote_index_and_client
                .remove(&timeline_id)
                .expect("just put it in above");

            // TODO again handle early failure
-            self.load_remote_timeline(timeline_id, index_part, remote_metadata, remote_client, ctx)
-                .await
-                .with_context(|| {
-                    format!(
-                        "failed to load remote timeline {} for tenant {}",
-                        timeline_id, self.tenant_id
-                    )
-                })?;
+            self.load_remote_timeline(
+                timeline_id,
+                index_part,
+                remote_metadata,
+                TimelineResources {
+                    remote_client: Some(remote_client),
+                },
+                ctx,
+            )
+            .await
+            .with_context(|| {
+                format!(
+                    "failed to load remote timeline {} for tenant {}",
+                    timeline_id, self.tenant_id
+                )
+            })?;
+        }
+
+        // Walk through deleted timelines, resume deletion
+        for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
+            remote_timeline_client
+                .init_upload_queue_stopped_to_continue_deletion(&index_part)
+                .context("init queue stopped")
+                .map_err(LoadLocalTimelineError::ResumeDeletion)?;
+
+            DeleteTimelineFlow::resume_deletion(
+                Arc::clone(self),
+                timeline_id,
+                &index_part.metadata,
+                Some(remote_timeline_client),
+                None,
+            )
+            .await
+            .context("resume_deletion")
+            .map_err(LoadLocalTimelineError::ResumeDeletion)?;
        }

        std::fs::remove_file(&marker_file)
@@ -637,27 +761,26 @@ impl Tenant {
        crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
            .context("fsync tenant directory after unlinking attach marker file")?;

-        utils::failpoint_sleep_millis_async!("attach-before-activate");
+        crate::failpoint_support::sleep_millis_async!("attach-before-activate");

        info!("Done");

        Ok(())
    }

-    /// get size of all remote timelines
+    /// Get the sum of the remote physical sizes of all timelines that have a remote client
    ///
    /// This function relies on the index_part instead of listing the remote storage
-    ///
-    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
+    pub fn remote_size(&self) -> u64 {
        let mut size = 0;

-        for timeline in self.list_timelines().iter() {
+        for timeline in self.list_timelines() {
            if let Some(remote_client) = &timeline.remote_client {
                size += remote_client.get_remote_physical_size();
            }
        }

-        Ok(size)
+        size
    }

    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
@@ -666,7 +789,7 @@ impl Tenant {
        timeline_id: TimelineId,
        index_part: IndexPart,
        remote_metadata: TimelineMetadata,
-        remote_client: RemoteTimelineClient,
+        resources: TimelineResources,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();
@@ -696,7 +819,7 @@ impl Tenant {

        self.timeline_init_and_sync(
            timeline_id,
-            Some(remote_client),
+            resources,
            Some(RemoteStartupData {
                index_part,
                remote_metadata,
@@ -740,12 +863,12 @@ impl Tenant {
    /// If the loading fails for some reason, the Tenant will go into Broken
    /// state.
    #[instrument(skip_all, fields(tenant_id=%tenant_id))]
-    pub fn spawn_load(
+    pub(crate) fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
-        broker_client: storage_broker::BrokerClientChannel,
-        remote_storage: Option<GenericRemoteStorage>,
+        resources: TenantSharedResources,
        init_order: Option<InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();
@@ -758,6 +881,9 @@ impl Tenant {
            }
        };

+        let broker_client = resources.broker_client;
+        let remote_storage = resources.remote_storage;
+
        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Tenant::new(
            TenantState::Loading,
@@ -765,7 +891,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            remote_storage,
+            remote_storage.clone(),
        );
        let tenant = Arc::new(tenant);

@@ -781,27 +907,84 @@ impl Tenant {
            "initial tenant load",
            false,
            async move {
+                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
+                let make_broken = |t: &Tenant, err: anyhow::Error| {
+                    error!("load failed, setting tenant state to Broken: {err:?}");
+                    t.state.send_modify(|state| {
+                        assert!(
+                            matches!(*state, TenantState::Loading | TenantState::Stopping { .. }),
+                            "the loading task owns the tenant state until activation is complete"
+                        );
+                        *state = TenantState::broken_from_reason(err.to_string());
+                    });
+                };
+
                let mut init_order = init_order;

                // take the completion because initial tenant loading will complete when all of
                // these tasks complete.
-                let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());
+                let _completion = init_order
+                    .as_mut()
+                    .and_then(|x| x.initial_tenant_load.take());
+
+                // Don't block pageserver startup on figuring out deletion status
+                let pending_deletion = {
+                    match DeleteTenantFlow::should_resume_deletion(
+                        conf,
+                        remote_storage.as_ref(),
+                        &tenant_clone,
+                    )
+                    .await
+                    {
+                        Ok(should_resume_deletion) => should_resume_deletion,
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                    }
+                };
+
+                info!("pending deletion {}", pending_deletion.is_some());
+
+                if let Some(deletion) = pending_deletion {
+                    // as we are no longer loading, signal completion by dropping
+                    // the completion while we resume deletion
+                    drop(_completion);
+                    // do not hold on to initial_logical_size_attempt, as holding it would prevent loading from proceeding without a timeout
+                    let _ = init_order
+                        .as_mut()
+                        .and_then(|x| x.initial_logical_size_attempt.take());
+
+                    match DeleteTenantFlow::resume_from_load(
+                        deletion,
+                        &tenant_clone,
+                        init_order.as_ref(),
+                        tenants,
+                        &ctx,
+                    )
+                    .await
+                    {
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                        Ok(()) => return Ok(()),
+                    }
+                }
+
+                let background_jobs_can_start =
+                    init_order.as_ref().map(|x| &x.background_jobs_can_start);

                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                    Ok(()) => {
-                        debug!("load finished, activating");
-                        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                        debug!("load finished");
+
                        tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
                    }
-                    Err(err) => {
-                        error!("load failed, setting tenant state to Broken: {err:?}");
-                        tenant_clone.state.send_modify(|state| {
-                            assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
-                            *state = TenantState::broken_from_reason(err.to_string());
-                        });
-                    }
+                    Err(err) => make_broken(&tenant_clone, err),
                }
-               Ok(())
+
+                Ok(())
            }
            .instrument({
                let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
@@ -877,6 +1060,8 @@ impl Tenant {
                        )
                    })?;

+                info!("Found deletion mark for timeline {}", timeline_id);
+
                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
                    Ok(metadata) => {
                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
@@ -966,9 +1151,11 @@ impl Tenant {

        // Sort the array of timeline IDs into tree-order, so that parent comes before
        // all its children.
-        tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
-            sorted_timelines_to_load: sorted_timelines,
-            timelines_to_resume_deletion,
+        tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
+            TenantDirectoryScan {
+                sorted_timelines_to_load: sorted_timelines,
+                timelines_to_resume_deletion,
+            }
        })
    }

@@ -986,7 +1173,7 @@ impl Tenant {

        debug!("loading tenant task");

-        utils::failpoint_sleep_millis_async!("before-loading-tenant");
+        crate::failpoint_support::sleep_millis_async!("before-loading-tenant");

        // Load in-memory state to reflect the local files on disk
        //
@@ -1014,8 +1201,9 @@ impl Tenant {
            {
                match e {
                    LoadLocalTimelineError::Load(source) => {
-                        return Err(anyhow::anyhow!(source)
-                            .context("Failed to load local timeline: {timeline_id}"))
+                        return Err(anyhow::anyhow!(source)).with_context(|| {
+                            format!("Failed to load local timeline: {timeline_id}")
+                        })
                    }
                    LoadLocalTimelineError::ResumeDeletion(source) => {
                        // Make sure resumed deletion wont fail loading for entire tenant.
@@ -1081,16 +1269,9 @@ impl Tenant {
    ) -> Result<(), LoadLocalTimelineError> {
        span::debug_assert_current_span_has_tenant_id();

-        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
-            RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.conf,
-                self.tenant_id,
-                timeline_id,
-            )
-        });
+        let mut resources = self.build_timeline_resources(timeline_id);

-        let (remote_startup_data, remote_client) = match remote_client {
+        let (remote_startup_data, remote_client) = match resources.remote_client {
            Some(remote_client) => match remote_client.download_index_file().await {
                Ok(index_part) => {
                    let index_part = match index_part {
@@ -1128,10 +1309,7 @@ impl Tenant {
                        }
                    };

-                    let remote_metadata = index_part
-                        .parse_metadata()
-                        .context("parse_metadata")
-                        .map_err(LoadLocalTimelineError::Load)?;
+                    let remote_metadata = index_part.metadata.clone();
                    (
                        Some(RemoteStartupData {
                            index_part,
@@ -1178,9 +1356,10 @@ impl Tenant {
                    return Ok(());
                }

-                (None, remote_client)
+                (None, resources.remote_client)
            }
        };
+        resources.remote_client = remote_client;

        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
@@ -1193,7 +1372,7 @@ impl Tenant {

        self.timeline_init_and_sync(
            timeline_id,
-            remote_client,
+            resources,
            remote_startup_data,
            Some(local_metadata),
            ancestor,
@@ -1640,6 +1819,8 @@ impl Tenant {
                    post_state = <&'static str>::from(&*current_state),
                    "activation attempt finished"
                );
+
+                TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
            });
        }
    }
@@ -1680,7 +1861,7 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        match self.set_stopping(shutdown_progress).await {
+        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
@@ -1720,18 +1901,28 @@ impl Tenant {
    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
    ///
    /// This function is not cancel-safe!
-    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
+    ///
+    /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
+    /// `allow_transition_from_attaching` is needed for the special case of attaching a deleted tenant.
+    async fn set_stopping(
+        &self,
+        progress: completion::Barrier,
+        allow_transition_from_loading: bool,
+        allow_transition_from_attaching: bool,
+    ) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();

        // cannot stop before we're done activating, so wait out until we're done activating
        rx.wait_for(|state| match state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+            TenantState::Attaching if allow_transition_from_attaching => true,
+            TenantState::Activating(_) | TenantState::Attaching => {
                info!(
                    "waiting for {} to turn Active|Broken|Stopping",
                    <&'static str>::from(state)
                );
                false
            }
+            TenantState::Loading => allow_transition_from_loading,
            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
        })
        .await
@@ -1740,8 +1931,22 @@ impl Tenant {
        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
        let mut err = None;
        let stopping = self.state.send_if_modified(|current_state| match current_state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
-                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
+            TenantState::Activating(_) => {
+                unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
+            }
+            TenantState::Attaching => {
+                if !allow_transition_from_attaching {
+                    unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
+                };
+                *current_state = TenantState::Stopping { progress };
+                true
+            }
+            TenantState::Loading => {
+                if !allow_transition_from_loading {
+                    unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
+                };
+                *current_state = TenantState::Stopping { progress };
+                true
            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
@@ -1811,6 +2016,11 @@ impl Tenant {
        .expect("cannot drop self.state while on a &self method");

        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
+        self.set_broken_no_wait(reason)
+    }
+
+    pub(crate) fn set_broken_no_wait(&self, reason: impl Display) {
+        let reason = reason.to_string();
        self.state.send_modify(|current_state| {
            match *current_state {
                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
@@ -1876,22 +2086,28 @@ impl Tenant {
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
 /// perform a topological sort, so that the parent of each timeline comes
 /// before the children.
-fn tree_sort_timelines(
-    timelines: HashMap<TimelineId, TimelineMetadata>,
-) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
+/// E extracts the ancestor from T
+/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
+fn tree_sort_timelines<T, E>(
+    timelines: HashMap<TimelineId, T>,
+    extractor: E,
+) -> anyhow::Result<Vec<(TimelineId, T)>>
+where
+    E: Fn(&T) -> Option<TimelineId>,
+{
    let mut result = Vec::with_capacity(timelines.len());

    let mut now = Vec::with_capacity(timelines.len());
    // (ancestor, children)
-    let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> =
+    let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
        HashMap::with_capacity(timelines.len());

-    for (timeline_id, metadata) in timelines {
-        if let Some(ancestor_id) = metadata.ancestor_timeline() {
+    for (timeline_id, value) in timelines {
+        if let Some(ancestor_id) = extractor(&value) {
            let children = later.entry(ancestor_id).or_default();
-            children.push((timeline_id, metadata));
+            children.push((timeline_id, value));
        } else {
-            now.push((timeline_id, metadata));
+            now.push((timeline_id, value));
        }
    }

@@ -2028,7 +2244,7 @@ impl Tenant {
        new_timeline_id: TimelineId,
        new_metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
-        remote_client: Option<RemoteTimelineClient>,
+        resources: TimelineResources,
        init_order: Option<&InitializationOrder>,
        cause: CreateTimelineCause,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -2057,10 +2273,10 @@ impl Tenant {
            new_timeline_id,
            self.tenant_id,
            Arc::clone(&self.walredo_mgr),
-            remote_client,
+            resources,
            pg_version,
            initial_logical_size_can_start.cloned(),
-            initial_logical_size_attempt.cloned(),
+            initial_logical_size_attempt.cloned().flatten(),
            state,
        );

@@ -2144,6 +2360,7 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
        }
    }

@@ -2160,6 +2377,7 @@ impl Tenant {
        // FIXME If the config file is not found, assume that we're attaching
        // a detached tenant and config is passed via attach command.
        // https://github.com/neondatabase/neon/issues/1555
+        // OR: we're loading after incomplete deletion that managed to remove config.
        if !target_config_path.exists() {
            info!("tenant config not found in {target_config_display}");
            return Ok(TenantConfOpt::default());
@@ -2297,7 +2515,9 @@ impl Tenant {
            .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
            .await?;

-        utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+        crate::failpoint_support::sleep_millis_async!(
+            "gc_iteration_internal_after_getting_gc_timelines"
+        );

        // If there is nothing to GC, we don't want any messages in the INFO log.
        if !gc_timelines.is_empty() {
@@ -2701,6 +2921,23 @@ impl Tenant {
        Ok(timeline)
    }

+    /// Assembles the resources a timeline needs before its struct is constructed.
+    ///
+    /// A `RemoteTimelineClient` is created only when this tenant has remote
+    /// storage configured; otherwise `remote_client` is `None`.
+    fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
+        let remote_client = self.remote_storage.as_ref().map(|storage| {
+            RemoteTimelineClient::new(storage.clone(), self.conf, self.tenant_id, timeline_id)
+        });
+
+        TimelineResources { remote_client }
+    }
+
    /// Creates intermediate timeline structure and its files.
    ///
    /// An empty layer map is initialized, and new data and WAL can be imported starting
@@ -2717,25 +2954,17 @@ impl Tenant {
    ) -> anyhow::Result<UninitializedTimeline> {
        let tenant_id = self.tenant_id;

-        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
-            let remote_client = RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.conf,
-                tenant_id,
-                new_timeline_id,
-            );
+        let resources = self.build_timeline_resources(new_timeline_id);
+        if let Some(remote_client) = &resources.remote_client {
            remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
-            Some(remote_client)
-        } else {
-            None
-        };
+        }

        let timeline_struct = self
            .create_timeline_struct(
                new_timeline_id,
                new_metadata,
                ancestor,
-                remote_client,
+                resources,
                None,
                CreateTimelineCause::Load,
            )
@@ -2889,7 +3118,7 @@ impl Tenant {
            .set(size);
    }

-    pub fn get_cached_synthetic_size(&self) -> u64 {
+    pub fn cached_synthetic_size(&self) -> u64 {
        self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
    }
 }
@@ -3808,6 +4037,34 @@ mod tests {
        Ok(())
    }

+    /// Smoke test: every level-0 delta layer of a test timeline can be dumped
+    /// without returning an error.
+    #[tokio::test]
+    async fn delta_layer_dumping() -> anyhow::Result<()> {
+        use storage_layer::AsLayerDesc;
+        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        // NOTE(review): `make_some_layers` is defined elsewhere in this module;
+        // presumably it leaves on-disk layers behind for the timeline.
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+
+        // Resolve each level-0 delta descriptor to its layer object.
+        let layer_map = tline.layers.read().await;
+        let level0_deltas = layer_map
+            .layer_map()
+            .get_level0_deltas()?
+            .into_iter()
+            .map(|desc| layer_map.get_from_desc(&desc))
+            .collect::<Vec<_>>();
+
+        // Without this check the loop below could pass without dumping anything.
+        assert!(!level0_deltas.is_empty());
+
+        for delta in level0_deltas {
+            // Ensure we are dumping a delta layer here
+            assert!(delta.layer_desc().is_delta);
+            delta.dump(true, &ctx).await.unwrap();
+        }
+
+        Ok(())
+    }
+
    #[tokio::test]
    async fn corrupt_metadata() -> anyhow::Result<()> {
        const TEST_NAME: &str = "corrupt_metadata";
@@ -3845,7 +4102,7 @@ mod tests {
        let mut found_error_message = false;
        let mut err_source = err.source();
        while let Some(source) = err_source {
-            if source.to_string() == "metadata checksum mismatch" {
+            if source.to_string().contains("metadata checksum mismatch") {
                found_error_message = true;
                break;
            }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -12,24 +12,21 @@
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use crate::page_cache::PAGE_SZ;
-use crate::tenant::block_io::{BlockCursor, BlockReader};
+use crate::tenant::block_io::BlockCursor;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
+impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
-        self.read_blob_into_buf(offset, &mut buf)?;
+        self.read_blob_into_buf(offset, &mut buf).await?;
        Ok(buf)
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub fn read_blob_into_buf(
-        &mut self,
+    pub async fn read_blob_into_buf(
+        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
    ) -> Result<(), std::io::Error> {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,12 +2,14 @@
 //! Low-level Block-oriented I/O functions
 //!

-use crate::page_cache;
-use crate::page_cache::{ReadBufResult, PAGE_SZ};
+use super::ephemeral_file::EphemeralFile;
+use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
+use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
+use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
+use std::fs::File;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;
-use std::sync::atomic::AtomicU64;

 /// This is implemented by anything that can read 8 kB (PAGE_SZ)
 /// blocks, using the page cache
@@ -15,36 +17,82 @@ use std::sync::atomic::AtomicU64;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
-    type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
-
-    ///
-    /// Read a block. Returns a "lease" object that can be used to
-    /// access to the contents of the page. (For the page cache, the
-    /// lease object represents a lock on the buffer.)
-    ///
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
-
    ///
    /// Create a new "cursor" for reading from this reader.
    ///
    /// A cursor caches the last accessed page, allowing for faster
    /// access if the same block is accessed repeatedly.
-    fn block_cursor(&self) -> BlockCursor<&Self>
-    where
-        Self: Sized,
-    {
-        BlockCursor::new(self)
-    }
+    fn block_cursor(&self) -> BlockCursor<'_>;
 }

 impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    type BlockLease = B::BlockLease;
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        (*self).block_cursor()
+    }
+}

-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        (*self).read_blk(blknum)
+/// Reference to an in-memory copy of an immutable on-disk block.
+///
+/// Whatever the backing variant, a lease dereferences to the page contents
+/// (`[u8; PAGE_SZ]`).
+pub enum BlockLease<'a> {
+    /// Page held through a page cache read guard.
+    PageReadGuard(PageReadGuard<'static>),
+    /// Borrowed page buffer. NOTE(review): judging by the name this is the
+    /// in-memory tail of an `EphemeralFile` — confirm against the code that
+    /// constructs this variant.
+    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
+    /// Reference-counted page; only exists in test builds.
+    #[cfg(test)]
+    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
+}
+
+/// Wraps a page cache read guard into a [`BlockLease`].
+impl From<PageReadGuard<'static>> for BlockLease<'static> {
+    fn from(guard: PageReadGuard<'static>) -> Self {
+        Self::PageReadGuard(guard)
+    }
+}
+
+/// Test-only conversion from a reference-counted page into a [`BlockLease`].
+#[cfg(test)]
+impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
+    fn from(page: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
+        Self::Rc(page)
+    }
+}
+
+/// Gives read access to the leased page, independent of how the lease is backed.
+impl<'a> Deref for BlockLease<'a> {
+    type Target = [u8; PAGE_SZ];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            // Delegate to the wrapped guard's own `Deref`.
+            BlockLease::PageReadGuard(v) => v.deref(),
+            // Already a reference to the page array.
+            BlockLease::EphemeralFileMutableTail(v) => v,
+            #[cfg(test)]
+            BlockLease::Rc(v) => v.deref(),
+        }
+    }
+}
+
+/// Provides the ability to read blocks from different sources,
+/// similar to using traits for this purpose.
+///
+/// Unlike traits, we also support the read function to be async though.
+///
+/// Each variant borrows (or wraps a borrow of) one concrete reader type;
+/// `read_blk` dispatches on the variant.
+pub(crate) enum BlockReaderRef<'a> {
+    /// A `FileBlockReader` over a `VirtualFile`.
+    FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
+    /// A `FileBlockReader` over a plain `std::fs::File`.
+    FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
+    /// An `EphemeralFile`.
+    EphemeralFile(&'a EphemeralFile),
+    /// A delta layer's `Adapter` around a borrowed `DeltaLayerInner`.
+    Adapter(Adapter<&'a DeltaLayerInner>),
+    /// The `TestDisk` helper from the `disk_btree` tests; test builds only.
+    #[cfg(test)]
+    TestDisk(&'a super::disk_btree::tests::TestDisk),
+}
+
+impl<'a> BlockReaderRef<'a> {
+    /// Reads block number `blknum` by forwarding to the `read_blk` of whichever
+    /// reader this reference wraps, returning the resulting lease or I/O error.
+    #[inline(always)]
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        use BlockReaderRef::*;
+        match self {
+            FileBlockReaderVirtual(r) => r.read_blk(blknum),
+            FileBlockReaderFile(r) => r.read_blk(blknum),
+            EphemeralFile(r) => r.read_blk(blknum),
+            Adapter(r) => r.read_blk(blknum),
+            #[cfg(test)]
+            TestDisk(r) => r.read_blk(blknum),
+        }
     }
 }

@@ -65,26 +113,31 @@ where
 /// // do stuff with 'buf'
 /// ```
 ///
-pub struct BlockCursor<R>
-where
-    R: BlockReader,
-{
-    reader: R,
+pub struct BlockCursor<'a> {
+    reader: BlockReaderRef<'a>,
 }

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
-    pub fn new(reader: R) -> Self {
+impl<'a> BlockCursor<'a> {
+    pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
        BlockCursor { reader }
    }
+    // Needed by cli
+    pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
+        BlockCursor {
+            reader: BlockReaderRef::FileBlockReaderVirtual(reader),
+        }
+    }

-    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    /// Read a block.
+    ///
+    /// Returns a "lease" object that can be used to
+    /// access to the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    #[inline(always)]
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
-static NEXT_ID: AtomicU64 = AtomicU64::new(1);

 /// An adapter for reading a (virtual) file using the page cache.
 ///
@@ -94,7 +147,7 @@ pub struct FileBlockReader<F> {
    pub file: F,

    /// Unique ID of this file, used as key in the page cache.
-    file_id: u64,
+    file_id: page_cache::FileId,
 }

 impl<F> FileBlockReader<F>
@@ -102,7 +155,7 @@ where
    F: FileExt,
 {
    pub fn new(file: F) -> Self {
-        let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let file_id = page_cache::next_file_id();

        FileBlockReader { file_id, file }
    }
@@ -112,16 +165,12 @@ where
        assert!(buf.len() == PAGE_SZ);
        self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
    }
-}
-
-impl<F> BlockReader for FileBlockReader<F>
-where
-    F: FileExt,
-{
-    type BlockLease = page_cache::PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        // Look up the right page
+    /// Read a block.
+    ///
+    /// Returns a "lease" object that can be used to
+    /// access to the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
@@ -132,7 +181,7 @@ where
                        format!("Failed to read immutable buf: {e:#}"),
                    )
                })? {
-                ReadBufResult::Found(guard) => break Ok(guard),
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
@@ -146,6 +195,18 @@ where
    }
 }

+/// Cursor support for readers backed by a plain `std::fs::File`.
+impl BlockReader for FileBlockReader<File> {
+    /// Creates a cursor that reads blocks through this reader.
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
+    }
+}
+
+/// Cursor support for readers backed by a `VirtualFile`.
+impl BlockReader for FileBlockReader<VirtualFile> {
+    /// Creates a cursor that reads blocks through this reader.
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
+    }
+}
+
 ///
 /// Trait for block-oriented output
 ///
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -0,0 +1,617 @@
+use std::{
+    path::{Path, PathBuf},
+    sync::Arc,
+};
+
+use anyhow::Context;
+use pageserver_api::models::TenantState;
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use tokio::sync::OwnedMutexGuard;
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, instrument, warn, Instrument, Span};
+
+use utils::{
+    backoff, completion, crashsafe, fs_ext,
+    id::{TenantId, TimelineId},
+};
+
+use crate::{
+    config::PageServerConf,
+    context::RequestContext,
+    task_mgr::{self, TaskKind},
+    InitializationOrder,
+};
+
+use super::{
+    mgr::{GetTenantError, TenantsMap},
+    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
+    span,
+    timeline::delete::DeleteTimelineFlow,
+    tree_sort_timelines, DeleteTimelineError, Tenant,
+};
+
+/// Attempt budget used when probing remote storage for the tenant deletion mark.
+/// `remote_delete_mark_exists` passes it to `backoff::retry` for both of that
+/// function's numeric limits.
+const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
+
+/// Errors produced by the tenant deletion flow.
+#[derive(Debug, thiserror::Error)]
+pub enum DeleteTenantError {
+    /// Looking up the tenant failed.
+    #[error("GetTenant {0}")]
+    Get(#[from] GetTenantError),
+
+    /// The tenant is in a state from which deletion cannot be started.
+    #[error("Invalid state {0}. Expected Active or Broken")]
+    InvalidState(TenantState),
+
+    /// Another deletion of the same tenant is already running.
+    #[error("Tenant deletion is already in progress")]
+    AlreadyInProgress,
+
+    /// Deleting one of the tenant's timelines failed.
+    #[error("Timeline {0}")]
+    Timeline(#[from] DeleteTimelineError),
+
+    /// Any other failure; displayed exactly as the wrapped error.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+/// Owned lock guard over a tenant's [`DeleteTenantFlow`] state.
+type DeletionGuard = OwnedMutexGuard<DeleteTenantFlow>;
+
+/// Builds the remote storage path of the tenant's deletion mark.
+///
+/// The tenant's local directory is made relative to the pageserver workdir,
+/// turned into a `RemotePath`, and the file name `deleted` is appended.
+fn remote_tenant_delete_mark_path(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> anyhow::Result<RemotePath> {
+    let local_tenant_path = conf.tenant_path(tenant_id);
+    let relative_path = local_tenant_path
+        .strip_prefix(&conf.workdir)
+        .context("Failed to strip workdir prefix");
+    let tenant_remote_path = relative_path
+        .and_then(RemotePath::new)
+        .context("tenant path")?;
+
+    Ok(tenant_remote_path.join(Path::new("deleted")))
+}
+
+/// Uploads the tenant's deletion mark to remote storage.
+///
+/// The mark is a zero-length object at the path returned by
+/// `remote_tenant_delete_mark_path`. The upload goes through `backoff::retry`;
+/// if that still ends in an error, it is returned with the `mark_upload` context.
+async fn create_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: &GenericRemoteStorage,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+
+    // The mark carries no payload; only its existence matters.
+    let data: &[u8] = &[];
+    backoff::retry(
+        || async {
+            remote_storage
+                .upload(data, 0, &remote_mark_path, None)
+                .await
+        },
+        // NOTE(review): presumably the "is this error permanent?" predicate, so
+        // every failure is treated as retryable — confirm against `backoff::retry`.
+        |_e| false,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "mark_upload",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+    )
+    .await
+    .context("mark_upload")?;
+
+    Ok(())
+}
+
+/// Creates the tenant's local deletion marker file and fsyncs it together with
+/// its parent directory.
+///
+/// An already existing marker is simply opened again, so repeating the call is fine.
+async fn create_local_delete_mark(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
+
+    // Only the existence of the file matters, so the handle is dropped right away.
+    let mut open_options = std::fs::OpenOptions::new();
+    open_options.write(true).create(true);
+    let marker_file = open_options
+        .open(&marker_path)
+        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
+    drop(marker_file);
+
+    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
+
+    Ok(())
+}
+
+/// Runs `DeleteTimelineFlow::run` for every timeline of the tenant, children
+/// before their ancestors.
+///
+/// Returns the guards (paired with their timeline ids) of deletions that were
+/// already in progress, so that the caller can deal with them separately.
+async fn schedule_ordered_timeline_deletions(
+    tenant: &Arc<Tenant>,
+) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
+    // The tenant is stopping at this point and we know it will be deleted, so no
+    // new timelines should be created.
+    //
+    // NOTE: cloning the map releases the mutex again, which opens a race: a
+    // pending deletion can complete and remove its timeline from the map between
+    // the clone and our call to `DeleteTimelineFlow::run`, so `run` won't find
+    // that timeline. `timelines.lock` is currently synchronous, so it cannot be
+    // held across an await point; a `NotFound` from `run` is therefore ignored.
+    // Beware: should the lock become async and be held here, `run` locks it as
+    // well, which can create a deadlock.
+    let snapshot = tenant.timelines.lock().unwrap().clone();
+
+    // The sort yields parents first; walking it in reverse deletes leaves first.
+    let parents_first =
+        tree_sort_timelines(snapshot, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
+
+    let mut already_running_deletions = Vec::new();
+
+    for (timeline_id, _) in parents_first.into_iter().rev() {
+        match DeleteTimelineFlow::run(tenant, timeline_id, true).await {
+            Ok(_) => {}
+            // The deletion finished after the clone above but before the call to
+            // `DeleteTimelineFlow::run`, and removed the timeline from the map.
+            Err(DeleteTimelineError::NotFound) => {}
+            Err(DeleteTimelineError::AlreadyInProgress(guard)) => {
+                already_running_deletions.push((guard, timeline_id));
+            }
+            Err(other) => return Err(DeleteTenantError::Timeline(other)),
+        }
+    }
+
+    Ok(already_running_deletions)
+}
+
+/// Verifies that the tenant's timelines directory has no entries left.
+///
+/// Returns `DeleteTenantError::Other` naming up to ten of the leftover entries
+/// when the directory is not empty; errors from checking or listing the
+/// directory are propagated as well.
+async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
+    // Assert timelines dir is empty.
+    if !fs_ext::is_directory_empty(timelines_path).await? {
+        // Display at most the first 10 items in the directory. Clamp the bound to
+        // the actual length: a fixed `[..10]` slice panics whenever fewer than 10
+        // entries are left, hiding the error we want to report.
+        let entries = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
+        let list = &entries[..entries.len().min(10)];
+        return Err(DeleteTenantError::Other(anyhow::anyhow!(
+            "Timelines directory is not empty after all timelines deletion: {list:?}"
+        )));
+    }
+
+    Ok(())
+}
+
+/// Deletes the tenant's deletion mark from remote storage.
+///
+/// Does nothing when no remote storage is configured.
+async fn remove_tenant_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: Option<&GenericRemoteStorage>,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    // Without remote storage there is no remote mark to remove.
+    let remote_storage = match remote_storage {
+        Some(remote_storage) => remote_storage,
+        None => return Ok(()),
+    };
+
+    let mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+    backoff::retry(
+        || async { remote_storage.delete(&mark_path).await },
+        |_e| false,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "remove_tenant_remote_delete_mark",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+    )
+    .await
+    .context("remove_tenant_remote_delete_mark")?;
+
+    Ok(())
+}
+
+/// Cleans up the remaining fs traces of a tenant, in this order: tenant config,
+/// timelines dir, local delete mark, tenant dir.
+///
+/// A path that no longer exists is not treated as an error (see the
+/// `fs_ext::ignore_not_found` handling in `rm`). Failpoints between the steps
+/// make the function return an error at that point.
+async fn cleanup_remaining_fs_traces(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    // Removes a single file or directory. Directories go through `remove_dir`,
+    // not a recursive removal. Failures get the offending path attached.
+    let rm = |p: PathBuf, is_dir: bool| async move {
+        if is_dir {
+            tokio::fs::remove_dir(&p).await
+        } else {
+            tokio::fs::remove_file(&p).await
+        }
+        .or_else(fs_ext::ignore_not_found)
+        .with_context(|| {
+            let to_display = p.display();
+            format!("failed to delete {to_display}")
+        })
+    };
+
+    rm(conf.tenant_config_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-timelines-dir"
+        ))?
+    });
+
+    rm(conf.timelines_path(tenant_id), true).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-deleted-mark"
+        ))?
+    });
+
+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So it's possible for the mark removal to reach disk first and for other
+    // deletions to be reordered later and thus missed if a crash occurs.
+    // Note that we don't need to sync after the mark file is removed
+    // because we can tolerate the case when the mark file reappears on startup.
+    let tenant_path = &conf.tenant_path(tenant_id);
+    if tenant_path.exists() {
+        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
+            .await
+            .context("fsync_pre_mark_remove")?;
+    }
+
+    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-tenant-dir"
+        ))?
+    });
+
+    rm(conf.tenant_path(tenant_id), true).await?;
+
+    Ok(())
+}
+
+/// Checks whether the tenant's deletion mark is present in remote storage.
+///
+/// Returns `Ok(true)` when downloading the mark succeeds, `Ok(false)` when the
+/// download ends with `DownloadError::NotFound`, and an error (with the
+/// `remote_delete_mark_exists` context) for any other download failure.
+pub(crate) async fn remote_delete_mark_exists(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+    remote_storage: &GenericRemoteStorage,
+) -> anyhow::Result<bool> {
+    // If remote storage is there we rely on it
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
+
+    let result = backoff::retry(
+        || async { remote_storage.download(&remote_mark_path).await },
+        // NOTE(review): presumably marks `NotFound` as a final answer that must
+        // not be retried — confirm against `backoff::retry`.
+        |e| matches!(e, DownloadError::NotFound),
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        "fetch_tenant_deletion_mark",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+    )
+    .await;
+
+    match result {
+        // The downloaded content is irrelevant; only existence is checked.
+        Ok(_) => Ok(true),
+        Err(DownloadError::NotFound) => Ok(false),
+        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
+    }
+}
+
+/// Orchestrates the shutdown of all tenant tasks, removes the tenant's in-memory
+/// structures, and deletes its data from both disk and s3.
+///
+/// The sequence of steps:
+/// 1. Upload remote deletion mark.
+/// 2. Create local mark file.
+/// 3. Shutdown tasks
+/// 4. Run ordered timeline deletions
+/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
+/// 6. Remove remote mark
+/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
+///
+/// It is resumable from any step in case a crash/restart occurs.
+///
+/// There are three entrypoints to the process:
+/// 1. [`DeleteTenantFlow::run`] is the main one, called by a management api handler.
+/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
+/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when the tenant is found to be deleted during the attach process, so that deletion is resumed.
+///
+/// Note: the only other place that messes around with the timeline delete mark is the `Tenant::spawn_load` function.
+#[derive(Default)]
+pub enum DeleteTenantFlow {
+    /// No deletion has been requested yet (the default).
+    #[default]
+    NotStarted,
+    /// A deletion has been started; also the state in which a retry finds the flow.
+    InProgress,
+    /// `mark_in_progress` refuses to restart a flow that is in this state.
+    Finished,
+}
+
+impl DeleteTenantFlow {
+    /// Entry point used by the management api request handler.
+    ///
+    /// The steps here run in the context of the request; long running steps are
+    /// handed over to a background task. The `'static` lifetimes are needed for
+    /// that background part.
+    ///
+    /// NB: if this fails half-way through and is retried, the retry goes through
+    /// all the same steps again. Keep the code idempotent, and don't error out if
+    /// some of the shutdown tasks have already been completed!
+    ///
+    /// The calling code is expected to have set up a span carrying the tenant_id.
+    #[instrument(skip_all)]
+    pub(crate) async fn run(
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(), DeleteTenantError> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
+
+        let outcome = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await;
+        match outcome {
+            Ok(()) => {
+                Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
+                Ok(())
+            }
+            Err(e) => {
+                // Mark the tenant broken so that the deletion can be retried.
+                tenant.set_broken(format!("{e:#}")).await;
+                Err(e)
+            }
+        }
+    }
+
+    /// Foreground part of the deletion: marks the flow as in progress, uploads the
+    /// remote deletion mark (when remote storage is configured) and creates the
+    /// local one.
+    ///
+    /// This helper exists so that the caller can match once on the returned error
+    /// and transition the tenant into the broken state.
+    /// This is needed because tenant.shutdown is not idempotent. If the tenant state is set to stopping, another call to tenant.shutdown
+    /// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
+    /// So the solution is to set the tenant state to broken.
+    async fn run_inner(
+        guard: &mut OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<&GenericRemoteStorage>,
+        tenant: &Tenant,
+    ) -> Result<(), DeleteTenantError> {
+        guard.mark_in_progress()?;
+
+        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-remote-mark"
+            ))?
+        });
+
+        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so won't contend.
+        // Though sounds scary, different mark name?
+        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
+        if let Some(remote_storage) = &remote_storage {
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
+                .await
+                .context("remote_mark")?
+        }
+
+        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-local-mark"
+            ))?
+        });
+
+        // The local mark is written only after the remote one.
+        create_local_delete_mark(conf, &tenant.tenant_id)
+            .await
+            .context("local delete mark")?;
+
+        fail::fail_point!("tenant-delete-before-background", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-background"
+            ))?
+        });
+
+        Ok(())
+    }
+
+    /// Moves the flow into `InProgress`.
+    ///
+    /// Allowed from `NotStarted` (fresh start) and from `InProgress` (retry);
+    /// a flow that is already `Finished` is reported as a bug.
+    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
+        if matches!(self, Self::Finished) {
+            anyhow::bail!("Bug. Is in finished state");
+        }
+
+        *self = Self::InProgress;
+
+        Ok(())
+    }
+
+    /// Decides whether a previously started deletion of `tenant` has to be resumed.
+    ///
+    /// A deletion is considered started if either the local delete mark file exists or,
+    /// when remote storage is configured, the remote delete mark exists. The local mark
+    /// is checked first so the remote lookup is skipped whenever possible.
+    ///
+    /// Returns `Ok(Some(guard))` holding the tenant's `delete_progress` lock when deletion
+    /// must be resumed, `Ok(None)` otherwise. Errors from the remote lookup are propagated.
+    ///
+    /// Panics if the `delete_progress` lock is already held when a mark was found.
+    pub async fn should_resume_deletion(
+        conf: &'static PageServerConf,
+        remote_storage: Option<&GenericRemoteStorage>,
+        tenant: &Tenant,
+    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
+        let tenant_id = tenant.tenant_id;
+
+        // Local mark first: if it is there, there is no need to ask remote storage at all.
+        let marked_for_deletion = if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+            true
+        } else {
+            match remote_storage {
+                Some(storage) => remote_delete_mark_exists(conf, &tenant_id, storage).await?,
+                None => false,
+            }
+        };
+
+        if !marked_for_deletion {
+            return Ok(None);
+        }
+
+        let guard = Arc::clone(&tenant.delete_progress)
+            .try_lock_owned()
+            .expect("we're the only owner during init");
+
+        Ok(Some(guard))
+    }
+
+    /// Resumes an interrupted tenant deletion for a tenant that is being loaded.
+    ///
+    /// Sequence: put the tenant into the stopping state, optionally wait for the
+    /// `background_jobs_can_start` barrier from `init_order`, load the tenant if its
+    /// timelines directory still exists, then run `Self::background` to completion
+    /// (awaited inline, not spawned).
+    ///
+    /// `guard` is the already acquired deletion lock and is handed over to `Self::background`.
+    ///
+    /// Panics if `set_stopping` returns an error.
+    pub(crate) async fn resume_from_load(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        init_order: Option<&InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        // Only the second half of the channel is kept; the first half is dropped right away.
+        let (_, progress) = completion::channel();
+
+        // NOTE(review): the meaning of the two boolean flags is defined by
+        // `Tenant::set_stopping`, which is not visible here. Confirm against its signature.
+        tenant
+            .set_stopping(progress, true, false)
+            .await
+            .expect("cant be stopping or broken");
+
+        // Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
+        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+        if let Some(background) = background_jobs_can_start {
+            info!("waiting for backgound jobs barrier");
+            background.clone().wait().await;
+            info!("ready for backgound jobs barrier");
+        }
+
+        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e.g. remove timelines dir)
+        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
+        if timelines_path.exists() {
+            tenant.load(init_order, ctx).await.context("load")?;
+        }
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
+    /// Resumes an interrupted tenant deletion for a tenant that is being attached.
+    ///
+    /// Sequence: put the tenant into the stopping state, attach it, then run
+    /// `Self::background` to completion (awaited inline, not spawned).
+    ///
+    /// `guard` is the already acquired deletion lock and is handed over to `Self::background`.
+    ///
+    /// Panics if `set_stopping` returns an error.
+    pub(crate) async fn resume_from_attach(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        // Only the second half of the channel is kept; the first half is dropped right away.
+        let (_, progress) = completion::channel();
+
+        // NOTE(review): the meaning of the two boolean flags is defined by
+        // `Tenant::set_stopping`, which is not visible here. Confirm against its signature.
+        tenant
+            .set_stopping(progress, false, true)
+            .await
+            .expect("cant be stopping or broken");
+
+        tenant.attach(ctx).await.context("attach")?;
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
+    /// Checks that the tenant may be deleted, takes its deletion lock and shuts it down.
+    ///
+    /// On success returns the tenant together with the owned guard over its
+    /// `delete_progress` mutex.
+    ///
+    /// Errors:
+    /// - `GetTenantError::NotFound` (converted by `?`) if `tenant_id` is not in the map,
+    /// - `DeleteTenantError::InvalidState` if the tenant is neither `Active` nor `Broken`,
+    /// - `DeleteTenantError::AlreadyInProgress` if the `delete_progress` lock is already held,
+    /// - `DeleteTenantError::Other` if `tenant.shutdown` returns an error.
+    ///
+    /// Note that the read lock on `tenants` stays held until this function returns,
+    /// i.e. also while the tenant is shutting down.
+    async fn prepare(
+        tenants: &tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
+        let m = tenants.read().await;
+
+        let tenant = m
+            .get(&tenant_id)
+            .ok_or(GetTenantError::NotFound(tenant_id))?;
+
+        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
+        // so at least for now allow deletions only for active tenants. TODO recheck
+        // Broken is accepted so that a deletion which failed earlier can be retried.
+        // Note that Stopping is NOT accepted by the check below.
+        if !matches!(
+            tenant.current_state(),
+            TenantState::Active | TenantState::Broken { .. }
+        ) {
+            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+        }
+
+        // Non-blocking attempt: a lock that is already held means another deletion is running.
+        let guard = Arc::clone(&tenant.delete_progress)
+            .try_lock_owned()
+            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
+
+        fail::fail_point!("tenant-delete-before-shutdown", |_| {
+            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
+        });
+
+        // make pageserver shutdown not to wait for our completion
+        let (_, progress) = completion::channel();
+
+        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
+        // i.e. it is an error to do:
+        // tenant.set_stopping
+        // tenant.shutdown
+        // It's also bad that we're holding tenants.read here.
+        // TODO relax set_stopping to be idempotent?
+        if tenant.shutdown(progress, false).await.is_err() {
+            return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                "tenant shutdown is already in progress"
+            )));
+        }
+
+        Ok((Arc::clone(tenant), guard))
+    }
+
+    /// Spawns `Self::background` as a task on `task_mgr::BACKGROUND_RUNTIME` and returns
+    /// immediately.
+    ///
+    /// If the background part fails, the error is logged and the tenant is marked broken
+    /// with the formatted error as the reason. The spawned task itself always yields `Ok(())`.
+    ///
+    /// Ownership of `guard` (the deletion lock) moves into the spawned task.
+    fn schedule_background(
+        guard: OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+    ) {
+        let tenant_id = tenant.tenant_id;
+
+        // NOTE(review): the positional arguments (`None`, `false`) are defined by
+        // `task_mgr::spawn`, which is not visible here. Confirm against its signature.
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(tenant_id),
+            None,
+            "tenant_delete",
+            false,
+            async move {
+                if let Err(err) =
+                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
+                {
+                    error!("Error: {err:#}");
+                    tenant.set_broken(format!("{err:#}")).await;
+                };
+                Ok(())
+            }
+            .instrument({
+                // The task gets its own root span (no parent); the span that scheduled it
+                // is linked through `follows_from` instead.
+                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                span.follows_from(Span::current());
+                span
+            }),
+        );
+    }
+
+    /// Background part of tenant deletion.
+    ///
+    /// Steps, in order:
+    /// 1. Schedule deletion of the tenant's timelines.
+    /// 2. Wait for timeline deletions that were already running and fail if any of them
+    ///    did not finish.
+    /// 3. If the timelines directory still exists, verify that it is empty.
+    /// 4. Remove the remote delete mark.
+    /// 5. Clean up the remaining filesystem traces.
+    /// 6. Remove the tenant from the `tenants` map and set the flow state to `Finished`.
+    ///
+    /// The `fail::fail_point!` invocations allow a failure to be injected between steps.
+    /// On any error the function returns early, so the flow state is not set to `Finished`.
+    async fn background(
+        mut guard: OwnedMutexGuard<Self>,
+        conf: &PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: &Arc<Tenant>,
+    ) -> Result<(), DeleteTenantError> {
+        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
+        // Note that if deletion fails we don't mark timelines as broken,
+        // the whole tenant will become broken as by `Self::schedule_background` logic
+        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
+            .await
+            .context("schedule_ordered_timeline_deletions")?;
+
+        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-polling-ongoing-deletions"
+            ))?
+        });
+
+        // Wait for deletions that were already running at the moment when tenant deletion was requested.
+        // When we can lock deletion guard it means that corresponding timeline deletion finished.
+        // Note: `guard` in this loop is the per-timeline lock and shadows the tenant-level
+        // `guard` parameter only inside the loop body.
+        for (guard, timeline_id) in already_running_timeline_deletions {
+            let flow = guard.lock().await;
+            if !flow.is_finished() {
+                return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                    "already running timeline deletion failed: {timeline_id}"
+                )));
+            }
+        }
+
+        let timelines_path = conf.timelines_path(&tenant.tenant_id);
+        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
+        if timelines_path.exists() {
+            // sanity check to guard against layout changes
+            ensure_timelines_dir_empty(&timelines_path)
+                .await
+                .context("timelines dir not empty")?;
+        }
+
+        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+
+        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
+            ))?
+        });
+
+        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
+            .await
+            .context("cleanup_remaining_fs_traces")?;
+
+        // A missing map entry is only logged, not treated as an error.
+        let mut locked = tenants.write().await;
+        if locked.remove(&tenant.tenant_id).is_none() {
+            warn!("Tenant got removed from tenants map during deletion");
+        };
+
+        // Here `guard` is the tenant-level deletion lock again.
+        *guard = Self::Finished;
+
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -20,6 +20,7 @@
 //!
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
+use either::Either;
 use hex;
 use std::{cmp::Ordering, io, result};
 use thiserror::Error;
@@ -230,14 +231,15 @@ where
    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
-    pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
+    pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
        let mut result: Option<u64> = None;
        self.visit(search_key, VisitDirection::Forwards, |key, value| {
            if key == search_key {
                result = Some(value);
            }
            false
-        })?;
+        })
+        .await?;
        Ok(result)
    }

@@ -246,7 +248,7 @@ where
    /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
    /// backwards)
    ///
-    pub fn visit<V>(
+    pub async fn visit<V>(
        &self,
        search_key: &[u8; L],
        dir: VisitDirection,
@@ -255,117 +257,78 @@ where
    where
        V: FnMut(&[u8], u64) -> bool,
    {
-        self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
-    }
+        let mut stack = Vec::new();
+        stack.push((self.root_blk, None));
+        let block_cursor = self.reader.block_cursor();
+        while let Some((node_blknum, opt_iter)) = stack.pop() {
+            // Locate the node.
+            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum)?;

-    fn search_recurse<V>(
-        &self,
-        node_blknum: u32,
-        search_key: &[u8; L],
-        dir: VisitDirection,
-        visitor: &mut V,
-    ) -> Result<bool>
-    where
-        V: FnMut(&[u8], u64) -> bool,
-    {
-        // Locate the node.
-        let blk = self.reader.read_blk(self.start_blk + node_blknum)?;
+            let node = OnDiskNode::deparse(node_buf.as_ref())?;
+            let prefix_len = node.prefix_len as usize;
+            let suffix_len = node.suffix_len as usize;

-        // Search all entries on this node
-        self.search_node(blk.as_ref(), search_key, dir, visitor)
-    }
+            assert!(node.num_children > 0);

-    fn search_node<V>(
-        &self,
-        node_buf: &[u8],
-        search_key: &[u8; L],
-        dir: VisitDirection,
-        visitor: &mut V,
-    ) -> Result<bool>
-    where
-        V: FnMut(&[u8], u64) -> bool,
-    {
-        let node = OnDiskNode::deparse(node_buf)?;
-        let prefix_len = node.prefix_len as usize;
-        let suffix_len = node.suffix_len as usize;
+            let mut keybuf = Vec::new();
+            keybuf.extend(node.prefix);
+            keybuf.resize(prefix_len + suffix_len, 0);

-        assert!(node.num_children > 0);
-
-        let mut keybuf = Vec::new();
-        keybuf.extend(node.prefix);
-        keybuf.resize(prefix_len + suffix_len, 0);
-
-        if dir == VisitDirection::Forwards {
-            // Locate the first match
-            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                Ok(idx) => idx,
-                Err(idx) => {
-                    if node.level == 0 {
-                        // Imagine that the node contains the following keys:
-                        //
-                        // 1
-                        // 3  <-- idx
-                        // 5
-                        //
-                        // If the search key is '2' and there is exact match,
-                        // the binary search would return the index of key
-                        // '3'. That's cool, '3' is the first key to return.
+            let mut iter = if let Some(iter) = opt_iter {
+                iter
+            } else if dir == VisitDirection::Forwards {
+                // Locate the first match
+                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                    Ok(idx) => idx,
+                    Err(idx) => {
+                        if node.level == 0 {
+                            // Imagine that the node contains the following keys:
+                            //
+                            // 1
+                            // 3  <-- idx
+                            // 5
+                            //
+                            // If the search key is '2' and there is no exact match,
+                            // the binary search would return the index of key
+                            // '3'. That's cool, '3' is the first key to return.
+                            idx
+                        } else {
+                            // This is an internal page, so each key represents a lower
+                            // bound for what's in the child page. If there is no exact
+                            // match, we have to return the *previous* entry.
+                            //
+                            // 1  <-- return this
+                            // 3  <-- idx
+                            // 5
+                            idx.saturating_sub(1)
+                        }
+                    }
+                };
+                Either::Left(idx..node.num_children.into())
+            } else {
+                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                    Ok(idx) => {
+                        // Exact match. That's the first entry to return, and walk
+                        // backwards from there.
                        idx
-                    } else {
-                        // This is an internal page, so each key represents a lower
-                        // bound for what's in the child page. If there is no exact
-                        // match, we have to return the *previous* entry.
-                        //
-                        // 1  <-- return this
-                        // 3  <-- idx
-                        // 5
-                        idx.saturating_sub(1)
                    }
-                }
-            };
-            // idx points to the first match now. Keep going from there
-            let mut key_off = idx * suffix_len;
-            while idx < node.num_children as usize {
-                let suffix = &node.keys[key_off..key_off + suffix_len];
-                keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx);
-                #[allow(clippy::collapsible_if)]
-                if node.level == 0 {
-                    // leaf
-                    if !visitor(&keybuf, value.to_u64()) {
-                        return Ok(false);
+                    Err(idx) => {
+                        // No exact match. The binary search returned the index of the
+                        // first key that's > search_key. Back off by one, and walk
+                        // backwards from there.
+                        if let Some(idx) = idx.checked_sub(1) {
+                            idx
+                        } else {
+                            return Ok(false);
+                        }
                    }
-                } else {
-                    #[allow(clippy::collapsible_if)]
-                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
-                        return Ok(false);
-                    }
-                }
-                idx += 1;
-                key_off += suffix_len;
-            }
-        } else {
-            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                Ok(idx) => {
-                    // Exact match. That's the first entry to return, and walk
-                    // backwards from there. (The loop below starts from 'idx -
-                    // 1', so add one here to compensate.)
-                    idx + 1
-                }
-                Err(idx) => {
-                    // No exact match. The binary search returned the index of the
-                    // first key that's > search_key. Back off by one, and walk
-                    // backwards from there. (The loop below starts from idx - 1,
-                    // so we don't need to subtract one here)
-                    idx
-                }
+                };
+                Either::Right((0..=idx).rev())
            };

-            // idx points to the first match + 1 now. Keep going from there.
-            let mut key_off = idx * suffix_len;
-            while idx > 0 {
-                idx -= 1;
-                key_off -= suffix_len;
+            // idx points to the first match now. Keep going from there
+            while let Some(idx) = iter.next() {
+                let key_off = idx * suffix_len;
                let suffix = &node.keys[key_off..key_off + suffix_len];
                keybuf[prefix_len..].copy_from_slice(suffix);
                let value = node.value(idx);
@@ -376,12 +339,8 @@ where
                        return Ok(false);
                    }
                } else {
-                    #[allow(clippy::collapsible_if)]
-                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
-                        return Ok(false);
-                    }
-                }
-                if idx == 0 {
+                    stack.push((node_blknum, Some(iter)));
+                    stack.push((value.to_blknum(), None));
                    break;
                }
            }
@@ -390,39 +349,44 @@ where
    }

    #[allow(dead_code)]
-    pub fn dump(&self) -> Result<()> {
-        self.dump_recurse(self.root_blk, &[], 0)
-    }
+    pub async fn dump(&self) -> Result<()> {
+        let mut stack = Vec::new();

-    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
-        let blk = self.reader.read_blk(self.start_blk + blknum)?;
-        let buf: &[u8] = blk.as_ref();
+        stack.push((self.root_blk, String::new(), 0, 0, 0));

-        let node = OnDiskNode::<L>::deparse(buf)?;
+        let block_cursor = self.reader.block_cursor();

-        print!("{:indent$}", "", indent = depth * 2);
-        println!(
-            "blk #{}: path {}: prefix {}, suffix_len {}",
-            blknum,
-            hex::encode(path),
-            hex::encode(node.prefix),
-            node.suffix_len
-        );
+        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
+            let blk = block_cursor.read_blk(self.start_blk + blknum)?;
+            let buf: &[u8] = blk.as_ref();
+            let node = OnDiskNode::<L>::deparse(buf)?;

-        let mut idx = 0;
-        let mut key_off = 0;
-        while idx < node.num_children {
+            if child_idx == 0 {
+                print!("{:indent$}", "", indent = depth * 2);
+                let path_prefix = stack
+                    .iter()
+                    .map(|(_blknum, path, ..)| path.as_str())
+                    .collect::<String>();
+                println!(
+                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
+                    hex::encode(node.prefix),
+                    node.suffix_len
+                );
+            }
+
+            if child_idx + 1 < node.num_children {
+                let key_off = key_off + node.suffix_len as usize;
+                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
+            }
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
-            let val = node.value(idx as usize);
+            let val = node.value(child_idx as usize);
+
            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
-                let child_path = [path, node.prefix].concat();
-                self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
+                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
            }
-            idx += 1;
-            key_off += node.suffix_len as usize;
        }
        Ok(())
    }
@@ -722,28 +686,30 @@ impl<const L: usize> BuildNode<L> {
 }

 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
    use super::*;
+    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[derive(Clone, Default)]
-    struct TestDisk {
+    pub(crate) struct TestDisk {
        blocks: Vec<Bytes>,
    }
    impl TestDisk {
        fn new() -> Self {
            Self::default()
        }
-    }
-    impl BlockReader for TestDisk {
-        type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
-
-        fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
+        pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::rc::Rc::new(buf))
+            Ok(std::rc::Rc::new(buf).into())
+        }
+    }
+    impl BlockReader for TestDisk {
+        fn block_cursor(&self) -> BlockCursor<'_> {
+            BlockCursor::new(BlockReaderRef::TestDisk(self))
        }
    }
    impl BlockWriter for &mut TestDisk {
@@ -754,8 +720,8 @@ mod tests {
        }
    }

-    #[test]
-    fn basic() -> Result<()> {
+    #[tokio::test]
+    async fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -775,16 +741,16 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump()?;
+        reader.dump().await?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
-            assert_eq!(reader.get(key)?, Some(*val));
+            assert_eq!(reader.get(key).await?, Some(*val));
        }
        // And on some keys that don't exist
-        assert_eq!(reader.get(b"aaaaaa")?, None);
-        assert_eq!(reader.get(b"zzzzzz")?, None);
-        assert_eq!(reader.get(b"xaaabx")?, None);
+        assert_eq!(reader.get(b"aaaaaa").await?, None);
+        assert_eq!(reader.get(b"zzzzzz").await?, None);
+        assert_eq!(reader.get(b"xaaabx").await?, None);

        // Test search with `visit` function
        let search_key = b"xabaaa";
@@ -795,10 +761,12 @@ mod tests {
            .collect();

        let mut data = Vec::new();
-        reader.visit(search_key, VisitDirection::Forwards, |key, value| {
-            data.push((key.to_vec(), value));
-            true
-        })?;
+        reader
+            .visit(search_key, VisitDirection::Forwards, |key, value| {
+                data.push((key.to_vec(), value));
+                true
+            })
+            .await?;
        assert_eq!(data, expected);

        // Test a backwards scan
@@ -809,16 +777,20 @@ mod tests {
            .collect();
        expected.reverse();
        let mut data = Vec::new();
-        reader.visit(search_key, VisitDirection::Backwards, |key, value| {
-            data.push((key.to_vec(), value));
-            true
-        })?;
+        reader
+            .visit(search_key, VisitDirection::Backwards, |key, value| {
+                data.push((key.to_vec(), value));
+                true
+            })
+            .await?;
        assert_eq!(data, expected);

        // Backward scan where nothing matches
-        reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
-            panic!("found unexpected key {}: {}", hex::encode(key), value);
-        })?;
+        reader
+            .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
+                panic!("found unexpected key {}: {}", hex::encode(key), value);
+            })
+            .await?;

        // Full scan
        let expected: Vec<(Vec<u8>, u64)> = all_data
@@ -826,17 +798,19 @@ mod tests {
            .map(|(key, value)| (key.to_vec(), *value))
            .collect();
        let mut data = Vec::new();
-        reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
-            data.push((key.to_vec(), value));
-            true
-        })?;
+        reader
+            .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
+                data.push((key.to_vec(), value));
+                true
+            })
+            .await?;
        assert_eq!(data, expected);

        Ok(())
    }

-    #[test]
-    fn lots_of_keys() -> Result<()> {
+    #[tokio::test]
+    async fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -856,7 +830,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump()?;
+        reader.dump().await?;

        use std::sync::Mutex;

@@ -877,13 +851,15 @@ mod tests {
        for search_key_int in 0..(NUM_KEYS * 2 + 10) {
            let search_key = u64::to_be_bytes(search_key_int);
            assert_eq!(
-                reader.get(&search_key)?,
+                reader.get(&search_key).await?,
                all_data.get(&search_key_int).cloned()
            );

            // Test a forward scan starting with this key
            result.lock().unwrap().clear();
-            reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
+            reader
+                .visit(&search_key, VisitDirection::Forwards, take_ten)
+                .await?;
            let expected = all_data
                .range(search_key_int..)
                .take(10)
@@ -893,7 +869,9 @@ mod tests {

            // And a backwards scan
            result.lock().unwrap().clear();
-            reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
+            reader
+                .visit(&search_key, VisitDirection::Backwards, take_ten)
+                .await?;
            let expected = all_data
                .range(..=search_key_int)
                .rev()
@@ -907,7 +885,9 @@ mod tests {
        let search_key = u64::to_be_bytes(0);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
+        reader
+            .visit(&search_key, VisitDirection::Forwards, take_ten)
+            .await?;
        let expected = all_data
            .iter()
            .map(|(&key, &val)| (key, val))
@@ -918,7 +898,9 @@ mod tests {
        let search_key = u64::to_be_bytes(u64::MAX);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
+        reader
+            .visit(&search_key, VisitDirection::Backwards, take_ten)
+            .await?;
        let expected = all_data
            .iter()
            .rev()
@@ -929,8 +911,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn random_data() -> Result<()> {
+    #[tokio::test]
+    async fn random_data() -> Result<()> {
        // Generate random keys with exponential distribution, to
        // exercise the prefix compression
        const NUM_KEYS: usize = 100000;
@@ -957,19 +939,23 @@ mod tests {
        // Test get() operation on all the keys
        for (&key, &val) in all_data.iter() {
            let search_key = u128::to_be_bytes(key);
-            assert_eq!(reader.get(&search_key)?, Some(val));
+            assert_eq!(reader.get(&search_key).await?, Some(val));
        }

        // Test get() operations on random keys, most of which will not exist
        for _ in 0..100000 {
            let key_int = rand::thread_rng().gen::<u128>();
            let search_key = u128::to_be_bytes(key_int);
-            assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
+            assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
        }

        // Test boundary cases
-        assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
-        assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());
+        assert!(
+            reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
+        );
+        assert!(
+            reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
+        );

        Ok(())
    }
@@ -994,8 +980,8 @@ mod tests {
    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
-    #[test]
-    fn particular_data() -> Result<()> {
+    #[tokio::test]
+    async fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1011,18 +997,20 @@ mod tests {

        // Test get() operation on all the keys
        for (key, val) in disk_btree_test_data::TEST_DATA {
-            assert_eq!(reader.get(&key)?, Some(val));
+            assert_eq!(reader.get(&key).await?, Some(val));
        }

        // Test full scan
        let mut count = 0;
-        reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
-            count += 1;
-            true
-        })?;
+        reader
+            .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
+                count += 1;
+                true
+            })
+            .await?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump()?;
+        reader.dump().await?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,46 +2,30 @@
 //! used to keep in-memory layers spilled on disk.

 use crate::config::PageServerConf;
-use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
-use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::BlockReader;
+use crate::page_cache::{self, PAGE_SZ};
+use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
-use once_cell::sync::Lazy;
 use std::cmp::min;
-use std::collections::HashMap;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
+use std::os::unix::prelude::FileExt;
 use std::path::PathBuf;
-use std::sync::{Arc, RwLock};
+use std::sync::atomic::AtomicU64;
 use tracing::*;
 use utils::id::{TenantId, TimelineId};

-use std::os::unix::fs::FileExt;
-
-///
-/// This is the global cache of file descriptors (File objects).
-///
-static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
-    RwLock::new(EphemeralFiles {
-        next_file_id: 1,
-        files: HashMap::new(),
-    })
-});
-
-pub struct EphemeralFiles {
-    next_file_id: u64,
-
-    files: HashMap<u64, Arc<VirtualFile>>,
-}
-
 pub struct EphemeralFile {
-    file_id: u64,
+    page_cache_file_id: page_cache::FileId,
+
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
-    file: Arc<VirtualFile>,
-
-    pub size: u64,
+    file: VirtualFile,
+    len: u64,
+    /// An ephemeral file is append-only.
+    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
+    /// The other pages, which can no longer be modified, are accessed through the page cache.
+    mutable_tail: [u8; PAGE_SZ],
 }

 impl EphemeralFile {
@@ -50,71 +34,179 @@ impl EphemeralFile {
        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<EphemeralFile, io::Error> {
-        let mut l = EPHEMERAL_FILES.write().unwrap();
-        let file_id = l.next_file_id;
-        l.next_file_id += 1;
+        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
+        let filename_disambiguator =
+            NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        let filename = conf
            .timeline_path(&tenant_id, &timeline_id)
-            .join(PathBuf::from(format!("ephemeral-{}", file_id)));
+            .join(PathBuf::from(format!("ephemeral-{filename_disambiguator}")));

        let file = VirtualFile::open_with_options(
            &filename,
            OpenOptions::new().read(true).write(true).create(true),
        )?;
-        let file_rc = Arc::new(file);
-        l.files.insert(file_id, file_rc.clone());

        Ok(EphemeralFile {
-            file_id,
+            page_cache_file_id: page_cache::next_file_id(),
            _tenant_id: tenant_id,
            _timeline_id: timeline_id,
-            file: file_rc,
-            size: 0,
+            file,
+            len: 0,
+            mutable_tail: [0u8; PAGE_SZ],
        })
    }

-    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
-        let mut off = 0;
-        while off < PAGE_SZ {
-            let n = self
-                .file
-                .read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
-
-            if n == 0 {
-                // Reached EOF. Fill the rest of the buffer with zeros.
-                const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
-
-                buf[off..].copy_from_slice(&ZERO_BUF[off..]);
-                break;
-            }
-
-            off += n;
-        }
-        Ok(())
+    pub(crate) fn len(&self) -> u64 {
+        self.len
    }

-    fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
-        // Look up the right page
-        let cache = page_cache::get();
-        let mut write_guard = match cache
-            .write_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
-        {
-            WriteBufResult::Found(guard) => guard,
-            WriteBufResult::NotFound(mut guard) => {
-                // Read the page from disk into the buffer
-                // TODO: if we're overwriting the whole page, no need to read it in first
-                self.fill_buffer(guard.deref_mut(), blkno)?;
-                guard.mark_valid();
+    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
+        if flushed_blknums.contains(&(blknum as u64)) {
+            let cache = page_cache::get();
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum)
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum,
+                                self.file.path.display(),
+                                e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
+                        write_guard.mark_valid();

-                // And then fall through to modify it.
-                guard
+                        // Swap for read lock
+                        continue;
+                    }
+                };
            }
-        };
-        write_guard.mark_dirty();
+        } else {
+            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
+            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
+        }
+    }

-        Ok(write_guard)
+    pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+        struct Writer<'a> {
+            ephemeral_file: &'a mut EphemeralFile,
+            /// The block to which the next [`push_bytes`] will write.
+            blknum: u32,
+            /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
+            off: usize,
+        }
+        impl<'a> Writer<'a> {
+            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
+                Ok(Writer {
+                    blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
+                    off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
+                    ephemeral_file,
+                })
+            }
+            #[inline(always)]
+            async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+                let mut src_remaining = src;
+                while !src_remaining.is_empty() {
+                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
+                    let n = min(dst_remaining.len(), src_remaining.len());
+                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
+                    self.off += n;
+                    src_remaining = &src_remaining[n..];
+                    if self.off == PAGE_SZ {
+                        match self.ephemeral_file.file.write_all_at(
+                            &self.ephemeral_file.mutable_tail,
+                            self.blknum as u64 * PAGE_SZ as u64,
+                        ) {
+                            Ok(_) => {
+                                // Pre-warm the page cache with what we just wrote.
+                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
+                                let cache = page_cache::get();
+                                match cache.read_immutable_buf(
+                                    self.ephemeral_file.page_cache_file_id,
+                                    self.blknum,
+                                ) {
+                                    Ok(page_cache::ReadBufResult::Found(_guard)) => {
+                                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
+                                        unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
+                                    }
+                                    Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
+                                        let buf: &mut [u8] = write_guard.deref_mut();
+                                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
+                                        write_guard.mark_valid();
+                                        // pre-warm successful
+                                    }
+                                    Err(e) => {
+                                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+                                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+                                    }
+                                }
+                                // Zero the buffer for re-use.
+                                // Zeroing is critical for correctness because the write_blob code below
+                                // and similarly read_blk expect zeroed pages.
+                                self.ephemeral_file.mutable_tail.fill(0);
+                                // This block is done, move to next one.
+                                self.blknum += 1;
+                                self.off = 0;
+                            }
+                            Err(e) => {
+                                return Err(std::io::Error::new(
+                                    ErrorKind::Other,
+                                    // order error before path because path is long and error is short
+                                    format!(
+                                        "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
+                                        self.blknum,
+                                        e,
+                                        self.ephemeral_file.file.path.display(),
+                                    ),
+                                ));
+                            }
+                        }
+                    }
+                }
+                Ok(())
+            }
+        }
+
+        let pos = self.len;
+        let mut writer = Writer::new(self)?;
+
+        // Write the length field
+        if srcbuf.len() < 0x80 {
+            // short one-byte length header
+            let len_buf = [srcbuf.len() as u8];
+            writer.push_bytes(&len_buf).await?;
+        } else {
+            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
+            len_buf[0] |= 0x80;
+            writer.push_bytes(&len_buf).await?;
+        }
+
+        // Write the payload
+        writer.push_bytes(srcbuf).await?;
+
+        if srcbuf.len() < 0x80 {
+            self.len += 1;
+        } else {
+            self.len += 4;
+        }
+        self.len += srcbuf.len() as u64;
+
+        Ok(pos)
    }
 }

@@ -127,210 +219,41 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

-impl FileExt for EphemeralFile {
-    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, dstbuf.len());
-
-        let read_guard;
-        let mut write_guard;
-
-        let cache = page_cache::get();
-        let buf = match cache
-            .read_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
-        {
-            ReadBufResult::Found(guard) => {
-                read_guard = guard;
-                read_guard.as_ref()
-            }
-            ReadBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to read the requested slice from the
-                // buffer.
-                write_guard.as_ref()
-            }
-        };
-
-        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
-        Ok(len)
-    }
-
-    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, srcbuf.len());
-
-        let mut write_guard;
-        let cache = page_cache::get();
-        let buf = match cache
-            .write_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
-        {
-            WriteBufResult::Found(guard) => {
-                write_guard = guard;
-                write_guard.deref_mut()
-            }
-            WriteBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                // TODO: if we're overwriting the whole page, no need to read it in first
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to modify it.
-                write_guard.deref_mut()
-            }
-        };
-
-        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
-        write_guard.mark_dirty();
-        Ok(len)
-    }
-}
-
-impl BlobWriter for EphemeralFile {
-    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
-        let pos = self.size;
-
-        let mut blknum = (self.size / PAGE_SZ as u64) as u32;
-        let mut off = (pos % PAGE_SZ as u64) as usize;
-
-        let mut buf = self.get_buf_for_write(blknum)?;
-
-        // Write the length field
-        if srcbuf.len() < 0x80 {
-            buf[off] = srcbuf.len() as u8;
-            off += 1;
-        } else {
-            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
-            len_buf[0] |= 0x80;
-            let thislen = PAGE_SZ - off;
-            if thislen < 4 {
-                // it needs to be split across pages
-                buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
-                blknum += 1;
-                buf = self.get_buf_for_write(blknum)?;
-                buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
-                off = 4 - thislen;
-            } else {
-                buf[off..off + 4].copy_from_slice(&len_buf);
-                off += 4;
-            }
-        }
-
-        // Write the payload
-        let mut buf_remain = srcbuf;
-        while !buf_remain.is_empty() {
-            let mut page_remain = PAGE_SZ - off;
-            if page_remain == 0 {
-                blknum += 1;
-                buf = self.get_buf_for_write(blknum)?;
-                off = 0;
-                page_remain = PAGE_SZ;
-            }
-            let this_blk_len = min(page_remain, buf_remain.len());
-            buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
-            off += this_blk_len;
-            buf_remain = &buf_remain[this_blk_len..];
-        }
-        drop(buf);
-
-        if srcbuf.len() < 0x80 {
-            self.size += 1;
-        } else {
-            self.size += 4;
-        }
-        self.size += srcbuf.len() as u64;
-
-        Ok(pos)
-    }
-}
-
 impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // drop all pages from page cache
        let cache = page_cache::get();
-        cache.drop_buffers_for_ephemeral(self.file_id);
-
-        // remove entry from the hash map
-        EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
+        cache.drop_buffers_for_immutable(self.page_cache_file_id);

        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
        if let Err(e) = res {
-            warn!(
-                "could not remove ephemeral file '{}': {}",
-                self.file.path.display(),
-                e
-            );
-        }
-    }
-}
-
-pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
-    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
-        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
-            Ok(_) => Ok(()),
-            Err(e) => Err(io::Error::new(
-                ErrorKind::Other,
-                format!(
-                    "failed to write back to ephemeral file at {} error: {}",
-                    file.path.display(),
+            if e.kind() != std::io::ErrorKind::NotFound {
+                // just never log the not found errors, we cannot do anything for them; on detach
+                // the tenant directory is already gone.
+                //
+                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
+                error!(
+                    "could not remove ephemeral file '{}': {}",
+                    self.file.path.display(),
                    e
-                ),
-            )),
+                );
+            }
        }
-    } else {
-        Err(io::Error::new(
-            ErrorKind::Other,
-            "could not write back page, not found in ephemeral files hash",
-        ))
    }
 }

 impl BlockReader for EphemeralFile {
-    type BlockLease = page_cache::PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
-        // Look up the right page
-        let cache = page_cache::get();
-        loop {
-            match cache
-                .read_ephemeral_buf(self.file_id, blknum)
-                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
-            {
-                ReadBufResult::Found(guard) => return Ok(guard),
-                ReadBufResult::NotFound(mut write_guard) => {
-                    // Read the page from disk into the buffer
-                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
-                    write_guard.mark_valid();
-
-                    // Swap for read lock
-                    continue;
-                }
-            };
-        }
+    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
+        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
    }
 }

-fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
-    io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::blob_io::BlobWriter;
-    use crate::tenant::block_io::BlockCursor;
-    use rand::{seq::SliceRandom, thread_rng, RngCore};
+    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
+    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

@@ -351,78 +274,43 @@ mod tests {
        Ok((conf, tenant_id, timeline_id))
    }

-    // Helper function to slurp contents of a file, starting at the current position,
-    // into a string
-    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
-        let mut buf = Vec::new();
-        buf.resize(len, 0u8);
-
-        efile.read_exact_at(&mut buf, offset)?;
-
-        Ok(String::from_utf8_lossy(&buf)
-            .trim_end_matches('\0')
-            .to_string())
-    }
-
-    #[test]
-    fn test_ephemeral_files() -> Result<(), io::Error> {
-        let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
-
-        let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
-
-        file_a.write_all_at(b"foo", 0)?;
-        assert_eq!("foo", read_string(&file_a, 0, 20)?);
-
-        file_a.write_all_at(b"bar", 3)?;
-        assert_eq!("foobar", read_string(&file_a, 0, 20)?);
-
-        // Open a lot of files, enough to cause some page evictions.
-        let mut efiles = Vec::new();
-        for fileno in 0..100 {
-            let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
-            efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
-            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
-            efiles.push((fileno, efile));
-        }
-
-        // Check that all the files can still be read from. Use them in random order for
-        // good measure.
-        efiles.as_mut_slice().shuffle(&mut thread_rng());
-        for (fileno, efile) in efiles.iter_mut() {
-            assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_ephemeral_blobs() -> Result<(), io::Error> {
+    #[tokio::test]
+    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

-        let pos_foo = file.write_blob(b"foo")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
-        let pos_bar = file.write_blob(b"bar")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
-        assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
+        let pos_foo = file.write_blob(b"foo").await?;
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
+        let pos_bar = file.write_blob(b"bar").await?;
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
+        assert_eq!(
+            b"bar",
+            file.block_cursor().read_blob(pos_bar).await?.as_slice()
+        );

        let mut blobs = Vec::new();
        for i in 0..10000 {
            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data)?;
+            let pos = file.write_blob(&data).await?;
            blobs.push((pos, data));
        }
        // also test with a large blobs
        for i in 0..100 {
            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data)?;
+            let pos = file.write_blob(&data).await?;
            blobs.push((pos, data));
        }

-        let mut cursor = BlockCursor::new(&file);
+        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos)?;
+            let actual = cursor.read_blob(pos).await?;
            assert_eq!(actual, expected);
        }

@@ -430,8 +318,8 @@ mod tests {
        let mut large_data = Vec::new();
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data)?;
-        let result = file.block_cursor().read_blob(pos_large)?;
+        let pos_large = file.write_blob(&large_data).await?;
+        let result = file.block_cursor().read_blob(pos_large).await?;
        assert_eq!(result, large_data);

        Ok(())
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -50,7 +50,6 @@ use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
-use crate::tenant::storage_layer::Layer;
 use anyhow::Result;
 use std::collections::VecDeque;
 use std::ops::Range;
@@ -121,7 +120,7 @@ impl BatchedUpdates<'_> {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
+    pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
        self.layer_map.remove_historic_noflush(layer_desc)
    }

@@ -253,11 +252,11 @@ impl LayerMap {
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
        self.historic
-            .remove(historic_layer_coverage::LayerKey::from(&layer_desc));
+            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc) {
+        if Self::is_l0(layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -640,148 +639,10 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for layer in self.iter_historic_layers() {
-            layer.dump(verbose, ctx)?;
+        for desc in self.iter_historic_layers() {
+            desc.dump();
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::LayerMap;
-    use crate::tenant::storage_layer::LayerFileName;
-    use std::str::FromStr;
-    use std::sync::Arc;
-
-    mod l0_delta_layers_updated {
-
-        use crate::tenant::{
-            storage_layer::{AsLayerDesc, PersistentLayerDesc},
-            timeline::layer_manager::LayerFileManager,
-        };
-
-        use super::*;
-
-        struct LayerObject(PersistentLayerDesc);
-
-        impl AsLayerDesc for LayerObject {
-            fn layer_desc(&self) -> &PersistentLayerDesc {
-                &self.0
-            }
-        }
-
-        impl LayerObject {
-            fn new(desc: PersistentLayerDesc) -> Self {
-                LayerObject(desc)
-            }
-        }
-
-        type TestLayerFileManager = LayerFileManager<LayerObject>;
-
-        #[test]
-        fn for_full_range_delta() {
-            // l0_delta_layers are used by compaction, and should observe all buffered updates
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                 true
-             )
-        }
-
-        #[test]
-        fn for_non_full_range_delta() {
-            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                 // because not full range
-                 false
-             )
-        }
-
-        #[test]
-        fn for_image() {
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                 // code only checks if it is a full range layer, doesn't care about images, which must
-                 // mean we should in practice never have full range images
-                 false
-             )
-        }
-
-        #[test]
-        fn replacing_missing_l0_is_notfound() {
-            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
-            // however only happen for precondition failures.
-
-            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
-            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = PersistentLayerDesc::from(layer);
-
-            // same skeletan construction; see scenario below
-            let not_found = Arc::new(LayerObject::new(layer.clone()));
-            let new_version = Arc::new(LayerObject::new(layer));
-
-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
-
-            let mut mapping = TestLayerFileManager::new();
-
-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
-        }
-
-        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
-            let name = LayerFileName::from_str(layer_name).unwrap();
-            let skeleton = PersistentLayerDesc::from(name);
-
-            let remote = Arc::new(LayerObject::new(skeleton.clone()));
-            let downloaded = Arc::new(LayerObject::new(skeleton));
-
-            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();
-
-            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
-            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
-
-            let expected_in_counts = (1, usize::from(expected_l0));
-
-            map.batch_update()
-                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
-            assert_eq!(
-                count_layer_in(&map, remote.layer_desc()),
-                expected_in_counts
-            );
-
-            mapping
-                .replace_and_verify(remote, downloaded.clone())
-                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
-            );
-
-            map.batch_update()
-                .remove_historic(downloaded.layer_desc().clone());
-            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
-        }
-
-        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
-            let historic = map
-                .iter_historic_layers()
-                .filter(|x| x.key() == layer.key())
-                .count();
-            let l0s = map
-                .get_level0_deltas()
-                .expect("why does this return a result");
-            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
-
-            (historic, l0)
-        }
-    }
-}
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
 use std::io::{self, Write};

 use anyhow::{bail, ensure, Context};
-use serde::{Deserialize, Serialize};
+use serde::{de::Error, Deserialize, Serialize, Serializer};
 use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
@@ -232,6 +232,28 @@ impl TimelineMetadata {
    }
 }

+impl<'de> Deserialize<'de> for TimelineMetadata {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let bytes = Vec::<u8>::deserialize(deserializer)?;
+        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
+    }
+}
+
+impl Serialize for TimelineMetadata {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let bytes = self
+            .to_bytes()
+            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
+        bytes.serialize(serializer)
+    }
+}
+
 /// Save timeline metadata to file
 pub fn save_metadata(
    conf: &'static PageServerConf,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,17 +20,20 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
+use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

+use super::delete::DeleteTenantError;
 use super::timeline::delete::DeleteTimelineFlow;
+use super::TenantSharedResources;

 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
-enum TenantsMap {
+pub(crate) enum TenantsMap {
    /// [`init_tenant_mgr`] is not done yet.
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
@@ -42,13 +45,13 @@ enum TenantsMap {
 }

 impl TenantsMap {
-    fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
        }
    }
-    fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
+    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
@@ -64,8 +67,7 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
 #[instrument(skip_all)]
 pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    resources: TenantSharedResources,
    init_order: InitializationOrder,
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
@@ -97,7 +99,9 @@ pub async fn init_tenant_mgr(
                        );
                    }
                } else {
-                    // This case happens if we crash during attach before creating the attach marker file
+                    // This case happens if we:
+                    // * crash during attach before creating the attach marker file
+                    // * crash during tenant delete before removing tenant directory
                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
                    })?;
@@ -121,9 +125,9 @@ pub async fn init_tenant_mgr(
                    match schedule_local_tenant_processing(
                        conf,
                        &tenant_dir_path,
-                        broker_client.clone(),
-                        remote_storage.clone(),
+                        resources.clone(),
                        Some(init_order.clone()),
+                        &TENANTS,
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -154,12 +158,12 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

-pub fn schedule_local_tenant_processing(
+pub(crate) fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_path: &Path,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    resources: TenantSharedResources,
    init_order: Option<InitializationOrder>,
+    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -194,8 +198,15 @@ pub fn schedule_local_tenant_processing(

    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if let Some(remote_storage) = remote_storage {
-            match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
+        if let Some(remote_storage) = resources.remote_storage {
+            match Tenant::spawn_attach(
+                conf,
+                tenant_id,
+                resources.broker_client,
+                tenants,
+                remote_storage,
+                ctx,
+            ) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -213,14 +224,7 @@ pub fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(
-            conf,
-            tenant_id,
-            broker_client,
-            remote_storage,
-            init_order,
-            ctx,
-        )
+        Tenant::spawn_load(conf, tenant_id, resources, init_order, tenants, ctx)
    };
    Ok(tenant)
 }
@@ -266,71 +270,77 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
        }
    };

+    let started_at = std::time::Instant::now();
    let mut join_set = JoinSet::new();
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                // ordering shouldn't matter for this, either we store true right away or never
-                let ordering = std::sync::atomic::Ordering::Relaxed;
-                let joined_other = std::sync::atomic::AtomicBool::new(false);
+                let freeze_and_flush = true;

-                let mut shutdown = std::pin::pin!(async {
-                    let freeze_and_flush = true;
-
-                    let res = {
-                        let (_guard, shutdown_progress) = completion::channel();
-                        tenant.shutdown(shutdown_progress, freeze_and_flush).await
-                    };
-
-                    if let Err(other_progress) = res {
-                        // join the another shutdown in progress
-                        joined_other.store(true, ordering);
-                        other_progress.wait().await;
-                    }
-                });
-
-                // in practice we might not have a lot time to go, since systemd is going to
-                // SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
-                // a warning.
-                let warning = std::time::Duration::from_secs(5);
-                let mut warning = std::pin::pin!(tokio::time::sleep(warning));
-
-                tokio::select! {
-                    _ = &mut shutdown => {},
-                    _ = &mut warning => {
-                        let joined_other = joined_other.load(ordering);
-                        warn!(%joined_other, "waiting for the shutdown to complete");
-                        shutdown.await;
-                    }
+                let res = {
+                    let (_guard, shutdown_progress) = completion::channel();
+                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
                };

+                if let Err(other_progress) = res {
+                    // another shutdown is already in progress: wait for it to finish
+                    other_progress.wait().await;
+                }
+
+                // we cannot afford per tenant logging here, because if s3 is degraded, we are
+                // going to log too many lines
+
                debug!("tenant successfully stopped");
            }
            .instrument(info_span!("shutdown", %tenant_id)),
        );
    }

+    let total = join_set.len();
    let mut panicked = 0;
+    let mut buffering = true;
+    const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
+    let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

-    while let Some(res) = join_set.join_next().await {
-        match res {
-            Ok(()) => {}
-            Err(join_error) if join_error.is_cancelled() => {
-                unreachable!("we are not cancelling any of the futures");
-            }
-            Err(join_error) if join_error.is_panic() => {
-                // cannot really do anything, as this panic is likely a bug
-                panicked += 1;
-            }
-            Err(join_error) => {
-                warn!("unknown kind of JoinError: {join_error}");
+    while !join_set.is_empty() {
+        tokio::select! {
+            Some(joined) = join_set.join_next() => {
+                match joined {
+                    Ok(()) => {}
+                    Err(join_error) if join_error.is_cancelled() => {
+                        unreachable!("we are not cancelling any of the futures");
+                    }
+                    Err(join_error) if join_error.is_panic() => {
+                        // cannot really do anything, as this panic is likely a bug
+                        panicked += 1;
+                    }
+                    Err(join_error) => {
+                        warn!("unknown kind of JoinError: {join_error}");
+                    }
+                }
+                if !buffering {
+                    // buffer so that every 500ms since the first update (or starting) we'll log
+                    // how far away we are; this is because we will get SIGKILL'd at 10s, and we
+                    // are not able to log *then*.
+                    buffering = true;
+                    buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
+                }
+            },
+            _ = &mut buffered, if buffering => {
+                buffering = false;
+                info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
            }
        }
    }

    if panicked > 0 {
-        warn!(panicked, "observed panicks while shutting down tenants");
+        warn!(
+            panicked,
+            total, "observed panicks while shutting down tenants"
+        );
    }
+
+    // caller will log how long we took
 }

 pub async fn create_tenant(
@@ -349,8 +359,12 @@ pub async fn create_tenant(
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

+        let tenant_resources = TenantSharedResources {
+            broker_client,
+            remote_storage,
+        };
        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -411,6 +425,14 @@ pub async fn get_tenant(
    }
 }

+/// Entry point for tenant deletion: delegates to [`DeleteTenantFlow::run`],
+/// passing the global `TENANTS` map, and returns the flow's result unchanged.
+pub async fn delete_tenant(
+    conf: &'static PageServerConf,
+    remote_storage: Option<GenericRemoteStorage>,
+    tenant_id: TenantId,
+) -> Result<(), DeleteTenantError> {
+    DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
@@ -426,7 +448,7 @@ pub async fn delete_timeline(
    _ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
+    DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
    Ok(())
 }

@@ -501,7 +523,11 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
+        let resources = TenantSharedResources {
+            broker_client,
+            remote_storage,
+        };
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path,  resources, None,  &TENANTS, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -582,7 +608,11 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
+        let resources = TenantSharedResources {
+            broker_client,
+            remote_storage: Some(remote_storage),
+        };
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -135,7 +135,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
+//!   [`Tenant::timeline_init_and_sync`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -163,8 +163,6 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -172,7 +170,6 @@
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
 //! We keep track of the fact that a client is in `Attaching` state in a marker
 //! file on the local disk. This is critical because, when we restart the pageserver,
 //! we do not want to do the `List timelines` step for each tenant that has already
@@ -192,14 +189,14 @@
 //! not created and the uploads are skipped.
 //! Theoretically, it should be ok to remove and re-add remote storage configuration to
 //! the pageserver config at any time, since it doesn't make a difference to
-//! `reconcile_with_remote`.
+//! [`Timeline::load_layer_map`].
 //! Of course, the remote timeline dir must not change while we have de-configured
 //! remote storage, i.e., the pageserver must remain the owner of the given prefix
 //! in remote storage.
 //! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
-//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
+//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

 mod delete;
 mod download;
@@ -211,6 +208,10 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
+use tokio_util::sync::CancellationToken;
+use utils::backoff::{
+    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};

 use std::collections::{HashMap, VecDeque};
 use std::path::Path;
@@ -219,7 +220,6 @@ use std::sync::{Arc, Mutex};

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
-use tokio::runtime::Runtime;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
@@ -229,8 +229,10 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
+use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+pub(crate) use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::{
    config::PageServerConf,
@@ -241,14 +243,13 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
-    {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };

 use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::LayerFileName;
+use super::storage_layer::{LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;

 // Occasional network issues and such can cause remote operations to fail, and
@@ -256,12 +257,12 @@ use super::upload_queue::SetDeletedFlagProgress;
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_DOWNLOAD_RETRIES times, we give up
-const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
-const FAILED_DOWNLOAD_RETRIES: u32 = 10;
+pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;

 // Similarly log failed uploads and deletions at WARN level, after this many
 // retries. Uploads and deletions are retried forever, though.
-const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
@@ -309,7 +310,7 @@ pub enum PersistIndexPartWithDeletedFlagError {
 pub struct RemoteTimelineClient {
    conf: &'static PageServerConf,

-    runtime: &'static Runtime,
+    runtime: tokio::runtime::Handle,

    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -336,7 +337,7 @@ impl RemoteTimelineClient {
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
-            runtime: &BACKGROUND_RUNTIME,
+            runtime: BACKGROUND_RUNTIME.handle().to_owned(),
            tenant_id,
            timeline_id,
            storage_impl: remote_storage,
@@ -352,6 +353,10 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part)?;
        self.update_remote_physical_size_gauge(Some(index_part));
+        info!(
+            "initialized upload queue from remote index with {} layer files",
+            index_part.layer_metadata.len()
+        );
        Ok(())
    }

@@ -364,6 +369,7 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_empty_remote(local_metadata)?;
        self.update_remote_physical_size_gauge(None);
+        info!("initialized upload queue as empty");
        Ok(())
    }

@@ -534,8 +540,7 @@ impl RemoteTimelineClient {
        // ahead of what's _actually_ on the remote during index upload.
        upload_queue.latest_metadata = metadata.clone();

-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-        self.schedule_index_upload(upload_queue, metadata_bytes);
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

        Ok(())
    }
@@ -555,8 +560,7 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-            self.schedule_index_upload(upload_queue, metadata_bytes);
+            self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
        }

        Ok(())
@@ -566,7 +570,7 @@ impl RemoteTimelineClient {
    fn schedule_index_upload(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        metadata_bytes: Vec<u8>,
+        metadata: TimelineMetadata,
    ) {
        info!(
            "scheduling metadata upload with {} files ({} changed)",
@@ -579,7 +583,7 @@ impl RemoteTimelineClient {
        let index_part = IndexPart::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
        self.calls_unfinished_metric_begin(&op);
@@ -593,25 +597,25 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub fn schedule_layer_file_upload(
+    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer_file_name: &LayerFileName,
-        layer_metadata: &LayerFileMetadata,
+        layer: ResidentLayer,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

+        let metadata = LayerFileMetadata::new(layer.layer_desc().file_size);
+
        upload_queue
            .latest_files
-            .insert(layer_file_name.clone(), layer_metadata.clone());
+            .insert(layer.layer_desc().filename(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

-        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
+        info!("scheduled layer file upload {layer}");
+        let op = UploadOp::UploadLayer(layer, metadata);
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);

-        info!("scheduled layer file upload {layer_file_name}");
-
        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
        Ok(())
@@ -635,7 +639,7 @@ impl RemoteTimelineClient {

        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        let metadata = upload_queue.latest_metadata.clone();

        // Update the remote index file, removing the to-be-deleted files from the index,
        // before deleting the actual files.
@@ -646,12 +650,13 @@ impl RemoteTimelineClient {
        // to syntactically forbid ? or bail! calls here.
        let no_bail_here = || {
            for name in names {
-                upload_queue.latest_files.remove(name);
-                upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                if upload_queue.latest_files.remove(name).is_some() {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                }
            }

            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata_bytes);
+                self.schedule_index_upload(upload_queue, metadata);
            }

            // schedule the actual deletions
@@ -752,12 +757,25 @@ impl RemoteTimelineClient {

        pausable_failpoint!("persist_deleted_index_part");

-        upload::upload_index_part(
-            self.conf,
-            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
-            &index_part_with_deleted_at,
+        backoff::retry(
+            || {
+                upload::upload_index_part(
+                    self.conf,
+                    &self.storage_impl,
+                    &self.tenant_id,
+                    &self.timeline_id,
+                    &index_part_with_deleted_at,
+                )
+            },
+            |_e| false,
+            1,
+            // have just a couple of attempts
+            // when executed as part of timeline deletion this happens in context of api call
+            // when executed as part of tenant deletion this happens in the background
+            2,
+            "persist_index_part_with_deleted_flag",
+            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await?;

@@ -834,10 +852,20 @@ impl RemoteTimelineClient {
        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;

-        let remaining = self
-            .storage_impl
-            .list_prefixes(Some(&timeline_storage_path))
-            .await?;
+        let remaining = backoff::retry(
+            || async {
+                self.storage_impl
+                    .list_files(Some(&timeline_storage_path))
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
+        )
+        .await
+        .context("list prefixes")?;

        let remaining: Vec<RemotePath> = remaining
            .into_iter()
@@ -852,7 +880,16 @@ impl RemoteTimelineClient {
            .collect();

        if !remaining.is_empty() {
-            self.storage_impl.delete_objects(&remaining).await?;
+            backoff::retry(
+                || async { self.storage_impl.delete_objects(&remaining).await },
+                |_e| false,
+                FAILED_UPLOAD_WARN_THRESHOLD,
+                FAILED_REMOTE_OP_RETRIES,
+                "delete_objects",
+                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
+            )
+            .await
+            .context("delete_objects")?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -864,7 +901,17 @@ impl RemoteTimelineClient {
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
-        self.storage_impl.delete(&index_file_path).await?;
+
+        backoff::retry(
+            || async { self.storage_impl.delete(&index_file_path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "delete_index",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
+        )
+        .await
+        .context("delete_index")?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
@@ -954,7 +1001,7 @@ impl RemoteTimelineClient {
            let tenant_id = self.tenant_id;
            let timeline_id = self.timeline_id;
            task_mgr::spawn(
-                self.runtime.handle(),
+                &self.runtime,
                TaskKind::RemoteUploadTask,
                Some(self.tenant_id),
                Some(self.timeline_id),
@@ -1006,11 +1053,8 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
-                    let path = &self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(layer_file_name.file_name());
+                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
@@ -1027,6 +1071,15 @@ impl RemoteTimelineClient {
                    .await
                }
                UploadOp::UploadMetadata(ref index_part, _lsn) => {
+                    let mention_having_future_layers = if cfg!(feature = "testing") {
+                        index_part
+                            .layer_metadata
+                            .keys()
+                            .any(|x| x.is_in_future(*_lsn))
+                    } else {
+                        false
+                    };
+
                    let res = upload::upload_index_part(
                        self.conf,
                        &self.storage_impl,
@@ -1044,6 +1097,10 @@ impl RemoteTimelineClient {
                    .await;
                    if res.is_ok() {
                        self.update_remote_physical_size_gauge(Some(index_part));
+                        if mention_having_future_layers {
+                            // find rationale near crate::tenant::timeline::init::cleanup_future_layer
+                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
+                        }
                    }
                    res
                }
@@ -1095,14 +1152,13 @@ impl RemoteTimelineClient {
                    }

                    // sleep until it's time to retry, or we're cancelled
-                    tokio::select! {
-                        _ = task_mgr::shutdown_watcher() => { },
-                        _ = exponential_backoff(
-                            retries,
-                            DEFAULT_BASE_BACKOFF_SECONDS,
-                            DEFAULT_MAX_BACKOFF_SECONDS,
-                        ) => { },
-                    };
+                    exponential_backoff(
+                        retries,
+                        DEFAULT_BASE_BACKOFF_SECONDS,
+                        DEFAULT_MAX_BACKOFF_SECONDS,
+                        &shutdown_token(),
+                    )
+                    .await;
                }
            }
        }
@@ -1307,7 +1363,8 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            Tenant,
+            storage_layer::Layer,
+            Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
    };
@@ -1316,7 +1373,6 @@ mod tests {
        collections::HashSet,
        path::{Path, PathBuf},
    };
-    use tokio::runtime::EnterGuard;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1366,35 +1422,25 @@ mod tests {
    }

    struct TestSetup {
-        runtime: &'static tokio::runtime::Runtime,
-        entered_runtime: EnterGuard<'static>,
        harness: TenantHarness,
        tenant: Arc<Tenant>,
+        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
        client: Arc<RemoteTimelineClient>,
    }

    impl TestSetup {
-        fn new(test_name: &str) -> anyhow::Result<Self> {
+        async fn new(test_name: &str) -> anyhow::Result<Self> {
            // Use a current-thread runtime in the test
-            let runtime = Box::leak(Box::new(
-                tokio::runtime::Builder::new_current_thread()
-                    .enable_all()
-                    .build()?,
-            ));
-            let entered_runtime = runtime.enter();
-
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name)?;
-            let (tenant, ctx) = runtime.block_on(harness.load());
+            let (tenant, ctx) = harness.load().await;
+
            // create an empty timeline directory
-            let _ = runtime.block_on(tenant.create_test_timeline(
-                TIMELINE_ID,
-                Lsn(8),
-                DEFAULT_PG_VERSION,
-                &ctx,
-            ))?;
+            let timeline = tenant
+                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+                .await?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1416,7 +1462,7 @@ mod tests {

            let client = Arc::new(RemoteTimelineClient {
                conf: harness.conf,
-                runtime,
+                runtime: tokio::runtime::Handle::current(),
                tenant_id: harness.tenant_id,
                timeline_id: TIMELINE_ID,
                storage_impl: storage,
@@ -1428,10 +1474,9 @@ mod tests {
            });

            Ok(Self {
-                runtime,
-                entered_runtime,
                harness,
                tenant,
+                timeline,
                tenant_ctx: ctx,
                remote_fs_dir,
                client,
@@ -1440,8 +1485,8 @@ mod tests {
    }

    // Test scheduling
-    #[test]
-    fn upload_scheduling() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn upload_scheduling() {
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1457,53 +1502,57 @@ mod tests {
        // Schedule index upload. Check that it's queued

        let TestSetup {
-            runtime,
-            entered_runtime: _entered_runtime,
            harness,
            tenant: _tenant,
+            timeline,
            tenant_ctx: _tenant_ctx,
            remote_fs_dir,
            client,
-        } = TestSetup::new("upload_scheduling").unwrap();
+        } = TestSetup::new("upload_scheduling").await.unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir.display());

        let remote_timeline_dir =
-            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
+            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
        println!("remote_timeline_dir: {}", remote_timeline_dir.display());

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        // Create a couple of dummy files,  schedule upload for them
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
-        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
-        let content_1 = dummy_contents("foo");
-        let content_2 = dummy_contents("bar");
-        let content_3 = dummy_contents("baz");
-        std::fs::write(
-            timeline_path.join(layer_file_name_1.file_name()),
-            &content_1,
-        )?;
-        std::fs::write(
-            timeline_path.join(layer_file_name_2.file_name()),
-            &content_2,
-        )?;
-        std::fs::write(timeline_path.join(layer_file_name_3.file_name()), content_3)?;

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_2,
-            &LayerFileMetadata::new(content_2.len() as u64),
-        )?;
+        let layers = [
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
+        ]
+        .into_iter()
+        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
+
+            Layer::for_resident(
+                harness.conf,
+                &timeline,
+                name,
+                LayerFileMetadata::new(contents.len() as u64),
+            )
+        }).collect::<Vec<_>>();
+
+        client
+            .schedule_layer_file_upload(layers[0].clone())
+            .unwrap();
+        client
+            .schedule_layer_file_upload(layers[1].clone())
+            .unwrap();

        // Check that they are started immediately, not queued
+        //
+        // this works because we are running within block_on, so any futures are now queued up until
+        // our next await point.
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1517,7 +1566,9 @@ mod tests {

        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
-        client.schedule_index_upload_for_metadata_update(&metadata)?;
+        client
+            .schedule_index_upload_for_metadata_update(&metadata)
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1526,7 +1577,7 @@ mod tests {
        }

        // Wait for the uploads to finish
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1536,29 +1587,31 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match runtime.block_on(client.download_index_file())? {
+        let index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };

        assert_file_list(
-            &index_part.timeline_layers,
+            &index_part
+                .layer_metadata
+                .keys()
+                .map(|f| f.to_owned())
+                .collect(),
            &[
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
            ],
        );
-        let downloaded_metadata = index_part.parse_metadata()?;
-        assert_eq!(downloaded_metadata, metadata);
+        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
-        let content_baz = dummy_contents("baz");
-        std::fs::write(timeline_path.join("baz"), &content_baz)?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_3,
-            &LayerFileMetadata::new(content_baz.len() as u64),
-        )?;
-        client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?;
+        client
+            .schedule_layer_file_upload(layers[2].clone())
+            .unwrap();
+        client
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1572,41 +1625,42 @@ mod tests {
        }
        assert_remote_files(
            &[
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
        );

        // Finish them
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        assert_remote_files(
            &[
-                &layer_file_name_2.file_name(),
-                &layer_file_name_3.file_name(),
+                &layers[1].layer_desc().filename().file_name(),
+                &layers[2].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
        );
-
-        Ok(())
    }

-    #[test]
-    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
        // Setup

        let TestSetup {
-            runtime,
            harness,
+            tenant: _tenant,
+            timeline,
            client,
            ..
-        } = TestSetup::new("metrics")?;
+        } = TestSetup::new("metrics").await.unwrap();

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

@@ -1615,7 +1669,15 @@ mod tests {
        std::fs::write(
            timeline_path.join(layer_file_name_1.file_name()),
            &content_1,
-        )?;
+        )
+        .unwrap();
+
+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64),
+        );

        #[derive(Debug, PartialEq)]
        struct BytesStartedFinished {
@@ -1641,14 +1703,13 @@ mod tests {

        let init = get_bytes_started_stopped();

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
+        client
+            .schedule_layer_file_upload(layer_file_1.clone())
+            .unwrap();

        let pre = get_bytes_started_stopped();

-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        let post = get_bytes_started_stopped();

@@ -1676,7 +1737,5 @@ mod tests {
                finished: Some(content_1.len())
            }
        );
-
-        Ok(())
    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,23 +11,18 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-
-use tracing::{info, warn};
+use tokio_util::sync::CancellationToken;
+use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
-
-async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    fs::File::open(path).await?.sync_all().await
-}
+use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);

@@ -152,7 +147,7 @@ pub async fn download_layer_file<'a>(
        })
        .map_err(DownloadError::Other)?;

-    fsync_path(&local_path)
+    crashsafe::fsync_async(&local_path)
        .await
        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;
@@ -268,7 +263,6 @@ pub(super) async fn download_index_part(
    Ok(index_part)
 }

-///
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
@@ -276,47 +270,21 @@ pub(super) async fn download_index_part(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
+async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
 {
-    let mut attempts = 0;
-    loop {
-        let result = op().await;
-        match result {
-            Ok(_) => {
-                if attempts > 0 {
-                    info!("{description} succeeded after {attempts} retries");
-                }
-                return result;
-            }
-
-            // These are "permanent" errors that should not be retried.
-            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
-                return result;
-            }
-            // Assume that any other failure might be transient, and the operation might
-            // succeed if we just keep trying.
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
-                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
-                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(ref err)) => {
-                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
-                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
-                return result;
-            }
-        }
-        // sleep and retry
-        exponential_backoff(
-            attempts,
-            DEFAULT_BASE_BACKOFF_SECONDS,
-            DEFAULT_MAX_BACKOFF_SECONDS,
-        )
-        .await;
-        attempts += 1;
-    }
+    backoff::retry(
+        op,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        description,
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
+            unreachable!()
+        }),
+    )
+    .await
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -62,10 +62,9 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

-    /// Layer names, which are stored on the remote storage.
-    ///
-    /// Additional metadata can might exist in `layer_metadata`.
-    pub timeline_layers: HashSet<LayerFileName>,
+    /// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
+    #[serde(default, skip_deserializing)]
+    timeline_layers: HashSet<LayerFileName>,

    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
@@ -74,10 +73,13 @@ pub struct IndexPart {
    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
-    // It's duplicated here for convenience.
+    // It's duplicated for convenience when reading the serialized structure, but is
+    // private because internally we would read from metadata instead.
    #[serde_as(as = "DisplayFromStr")]
-    pub disk_consistent_lsn: Lsn,
-    metadata_bytes: Vec<u8>,
+    disk_consistent_lsn: Lsn,
+
+    #[serde(rename = "metadata_bytes")]
+    pub metadata: TimelineMetadata,
 }

 impl IndexPart {
@@ -85,13 +87,17 @@ impl IndexPart {
    /// used to understand later versions.
    ///
    /// Version is currently informative only.
-    const LATEST_VERSION: usize = 2;
+    /// Version history
+    /// - 2: added `deleted_at`
+    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
+    ///      is always generated from the keys of `layer_metadata`)
+    const LATEST_VERSION: usize = 3;
    pub const FILE_NAME: &'static str = "index_part.json";

    pub fn new(
        layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
        disk_consistent_lsn: Lsn,
-        metadata_bytes: Vec<u8>,
+        metadata: TimelineMetadata,
    ) -> Self {
        let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
        let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
@@ -107,14 +113,10 @@ impl IndexPart {
            timeline_layers,
            layer_metadata,
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
            deleted_at: None,
        }
    }
-
-    pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
-        TimelineMetadata::from_bytes(&self.metadata_bytes)
-    }
 }

 impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -122,12 +124,12 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {

    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        let metadata = upload_queue.latest_metadata.clone();

        Ok(Self::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
        ))
    }
 }
@@ -166,7 +168,7 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
@@ -178,7 +180,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
        };

@@ -197,13 +199,13 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        }"#;

        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
@@ -215,7 +217,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
        };

@@ -223,6 +225,45 @@ mod tests {
        assert_eq!(part, expected);
    }

+    #[test]
+    fn v2_indexpart_is_parsed_with_deleted_at() {
+        let example = r#"{
+            "version":2,
+            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
+            "missing_layers":["This shouldn't fail deserialization"],
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "deleted_at": "2023-07-31T09:00:00.123"
+        }"#;
+
+        let expected = IndexPart {
+            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
+            version: 2,
+            timeline_layers: HashSet::new(),
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 25600000,
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                    // serde_json should always parse this but this might be a double with jq for
+                    // example.
+                    file_size: 9007199254741001,
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+        };
+
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        assert_eq!(part, expected);
+    }
+
    #[test]
    fn empty_layers_are_parsed() {
        let empty_layers_json = r#"{
@@ -238,7 +279,7 @@ mod tests {
            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::new(),
            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
-            metadata_bytes: [
+            metadata: TimelineMetadata::from_bytes(&[
                136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
                38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
                210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
@@ -259,8 +300,8 @@ mod tests {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0,
-            ]
-            .to_vec(),
+            ])
+            .unwrap(),
            deleted_at: None,
        };

--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -67,6 +67,8 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
+            //
+            // This is tested by `test_compaction_delete_before_upload`
            info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,26 +4,21 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer;
 mod layer_desc;
-mod remote_layer;

-use crate::config::PageServerConf;
-use crate::context::RequestContext;
-use crate::repository::{Key, Value};
+use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
-use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::path::PathBuf;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -34,14 +29,13 @@ use utils::{
    lsn::Lsn,
 };

-pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-pub use remote_layer::RemoteLayer;

-use super::timeline::layer_manager::LayerManager;
+pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -76,7 +70,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from Layer::get_page_reconstruct_data
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -175,41 +169,9 @@ impl LayerAccessStats {
    ///
    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn for_loading_layer(
-        layer_map_lock_held_witness: &LayerManager,
-        status: LayerResidenceStatus,
-    ) -> Self {
+    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
-        new.record_residence_event(
-            layer_map_lock_held_witness,
-            status,
-            LayerResidenceEventReason::LayerLoad,
-        );
-        new
-    }
-
-    /// Creates a clone of `self` and records `new_status` in the clone.
-    ///
-    /// The `new_status` is not recorded in `self`.
-    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn clone_for_residence_change(
-        &self,
-        layer_map_lock_held_witness: &LayerManager,
-        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
-        let clone = {
-            let inner = self.0.lock().unwrap();
-            inner.clone()
-        };
-        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(
-            layer_map_lock_held_witness,
-            new_status,
-            LayerResidenceEventReason::ResidenceChange,
-        );
+        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
        new
    }

@@ -229,7 +191,6 @@ impl LayerAccessStats {
    ///
    pub(crate) fn record_residence_event(
        &self,
-        _layer_map_lock_held_witness: &LayerManager,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
    ) {
@@ -241,10 +202,14 @@ impl LayerAccessStats {
        });
    }

-    fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
+    fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
+        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
+            return;
+        }
+
        let this_access = LayerAccessStatFullDetails {
            when: SystemTime::now(),
-            task_kind,
+            task_kind: ctx.task_kind(),
            access_kind,
        };

@@ -252,7 +217,7 @@ impl LayerAccessStats {
        locked.iter_mut().for_each(|inner| {
            inner.first_access.get_or_insert(this_access);
            inner.count_by_access_kind[access_kind] += 1;
-            inner.task_kind_flag |= task_kind;
+            inner.task_kind_flag |= ctx.task_kind();
            inner.last_accesses.write(this_access);
        })
    }
@@ -332,148 +297,12 @@ impl LayerAccessStats {
    }
 }

-/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
-///
-/// All layers should implement a minimal `std::fmt::Debug` without tenant or
-/// timeline names, because those are known in the context of which the layers
-/// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
-    /// Range of keys that this layer covers
-    fn get_key_range(&self) -> Range<Key>;
-
-    /// Inclusive start bound of the LSN range that this layer holds
-    /// Exclusive end bound of the LSN range that this layer holds.
-    ///
-    /// - For an open in-memory layer, this is MAX_LSN.
-    /// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
-    /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
-    fn get_lsn_range(&self) -> Range<Lsn>;
-
-    /// Does this layer only contain some data for the key-range (incremental),
-    /// or does it contain a version of every page? This is important to know
-    /// for garbage collecting old layers: an incremental layer depends on
-    /// the previous non-incremental layer.
-    fn is_incremental(&self) -> bool;
-
-    ///
-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// See PageReconstructResult for possible return values. The collected data
-    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call, or a struct with a cached older image of the page if one
-    /// is available. If this returns ValueReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data' to
-    /// collect more data.
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult>;
-
-    /// Dump summary of the contents of the layer to stdout
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
-}
-
-/// Returned by [`PersistentLayer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
-
-/// Returned by [`PersistentLayer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

-/// A Layer contains all data in a "rectangle" consisting of a range of keys and
-/// range of LSNs.
-///
-/// There are two kinds of layers, in-memory and on-disk layers. In-memory
-/// layers are used to ingest incoming WAL, and provide fast access to the
-/// recent page versions. On-disk layers are stored as files on disk, and are
-/// immutable. This trait presents the common functionality of in-memory and
-/// on-disk layers.
-///
-/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
-/// A delta layer contains all modifications within a range of LSNs and keys.
-/// An image layer is a snapshot of all the data in a key-range, at a single
-/// LSN.
-pub trait PersistentLayer: Layer + AsLayerDesc {
-    /// Identify the tenant this layer belongs to
-    fn get_tenant_id(&self) -> TenantId {
-        self.layer_desc().tenant_id
-    }
-
-    /// Identify the timeline this layer belongs to
-    fn get_timeline_id(&self) -> TimelineId {
-        self.layer_desc().timeline_id
-    }
-
-    /// File name used for this layer, both in the pageserver's local filesystem
-    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName {
-        self.layer_desc().filename()
-    }
-
-    // Path to the layer file in the local filesystem.
-    // `None` for `RemoteLayer`.
-    fn local_path(&self) -> Option<PathBuf>;
-
-    /// Iterate through all keys and values stored in the layer
-    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
-
-    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
-    /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        panic!("Not implemented")
-    }
-
-    /// Permanently remove this layer from disk.
-    fn delete_resident_layer_file(&self) -> Result<()>;
-
-    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        None
-    }
-
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        false
-    }
-
-    /// Returns None if the layer file size is not known.
-    ///
-    /// Should not change over the lifetime of the layer object because
-    /// current_physical_size is computed as the som of this value.
-    fn file_size(&self) -> u64 {
-        self.layer_desc().file_size
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
-
-    fn access_stats(&self) -> &LayerAccessStats;
-}
-
-pub fn downcast_remote_layer(
-    layer: &Arc<dyn PersistentLayer>,
-) -> Option<std::sync::Arc<RemoteLayer>> {
-    if layer.is_remote_layer() {
-        Arc::clone(layer).downcast_remote_layer()
-    } else {
-        None
-    }
-}
-
 pub mod tests {
    use super::*;

@@ -496,7 +325,6 @@ pub mod tests {
                TimelineId::from_array([0; 16]),
                value.key_range,
                value.lsn,
-                false,
                233,
            )
        }
@@ -512,19 +340,6 @@ pub mod tests {
    }
 }

-/// Helper enum to hold a PageServerConf, or a path
-///
-/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
-/// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
-/// struct for a file on disk, without having a page server running, so that we have no
-/// config. In that case, we use the Path variant to hold the full path to the file on
-/// disk.
-enum PathOrConf {
-    Path(PathBuf),
-    Conf(&'static PageServerConf),
-}
-
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -212,9 +212,20 @@ pub enum LayerFileName {
 }

 impl LayerFileName {
-    pub fn file_name(&self) -> String {
+    pub(crate) fn file_name(&self) -> String {
        self.to_string()
    }
+
+    /// Determines if this layer file is considered to be in the future, meaning we will discard
+    /// such layers during timeline initialization from the given `disk_consistent_lsn`.
+    pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
+        use LayerFileName::*;
+        match self {
+            Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
+            Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
+            _ => false,
+        }
+    }
 }

 impl fmt::Display for LayerFileName {
@@ -263,8 +274,8 @@ impl serde::Serialize for LayerFileName {
        S: serde::Serializer,
    {
        match self {
-            Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
-            Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
+            Self::Image(fname) => serializer.collect_str(fname),
+            Self::Delta(fname) => serializer.collect_str(fname),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -31,23 +31,25 @@ use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
-    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
-use once_cell::sync::OnceCell;
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::Write;
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
+use std::sync::Arc;
+use tokio::sync::OnceCell;
 use tracing::*;

 use utils::{
@@ -57,9 +59,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{
-    AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
-};
+use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -68,7 +68,7 @@ use super::{
 /// the 'index' starts at the block indicated by 'index_start_blk'
 ///
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
-struct Summary {
+pub(super) struct Summary {
    /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
    magic: u16,
    format_version: u16,
@@ -87,13 +87,29 @@ struct Summary {

 impl From<&ImageLayer> for Summary {
    fn from(layer: &ImageLayer) -> Self {
+        Self::expected(
+            layer.desc.tenant_id,
+            layer.desc.timeline_id,
+            layer.desc.key_range.clone(),
+            layer.lsn,
+        )
+    }
+}
+
+impl Summary {
+    pub(super) fn expected(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn: Lsn,
+    ) -> Self {
        Self {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id: layer.desc.tenant_id,
-            timeline_id: layer.desc.timeline_id,
-            key_range: layer.desc.key_range.clone(),
-            lsn: layer.lsn,
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn,

            index_start_blk: 0,
            index_root_blk: 0,
@@ -101,22 +117,14 @@ impl From<&ImageLayer> for Summary {
    }
 }

-/// ImageLayer is the in-memory data structure associated with an on-disk image
-/// file.
-///
-/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
 pub struct ImageLayer {
-    path_or_conf: PathOrConf,
-
+    path: PathBuf,
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<ImageLayerInner>,
 }

@@ -133,11 +141,15 @@ impl std::fmt::Debug for ImageLayer {
    }
 }

+/// ImageLayerInner is the in-memory data structure associated with an on-disk image
+/// file.
 pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

+    lsn: Lsn,
+
    /// Reader object for reading blocks from the file.
    file: FileBlockReader<VirtualFile>,
 }
@@ -151,90 +163,23 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for ImageLayer {
-    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental,
-            self.desc.file_size
-        );
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx)?;
-        let file = &inner.file;
+impl ImageLayerInner {
+    pub(super) async fn dump(&self) -> anyhow::Result<()> {
+        let file = &self.file;
        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
+            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);

-        tree_reader.dump()?;
+        tree_reader.dump().await?;

-        tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
-            println!("key: {} offset {}", hex::encode(key), value);
-            true
-        })?;
+        tree_reader
+            .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
+                println!("key: {} offset {}", hex::encode(key), value);
+                true
+            })
+            .await?;

        Ok(())
    }
-
-    /// Look up given page in the file
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.desc.key_range.contains(&key));
-        assert!(lsn_range.start >= self.lsn);
-        assert!(lsn_range.end >= self.lsn);
-
-        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
-
-        let file = &inner.file;
-        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
-
-        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
-        key.write_to_byte_slice(&mut keybuf);
-        if let Some(offset) = tree_reader.get(&keybuf)? {
-            let blob = file.block_cursor().read_blob(offset).with_context(|| {
-                format!(
-                    "failed to read value from data file {} at offset {}",
-                    self.path().display(),
-                    offset
-                )
-            })?;
-            let value = Bytes::from(blob);
-
-            reconstruct_state.img = Some((self.lsn, value));
-            Ok(ValueReconstructResult::Complete)
-        } else {
-            Ok(ValueReconstructResult::Missing)
-        }
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_key_range(&self) -> Range<Key> {
-        self.layer_desc().key_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.layer_desc().lsn_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn is_incremental(&self) -> bool {
-        self.layer_desc().is_incremental
-    }
 }

 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -250,52 +195,19 @@ impl AsLayerDesc for ImageLayer {
    }
 }

-impl PersistentLayer for ImageLayer {
-    fn local_path(&self) -> Option<PathBuf> {
-        Some(self.path())
-    }
-
-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        unimplemented!();
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
-        Ok(())
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.filename().file_name();
-        let lsn_range = self.get_lsn_range();
-
-        HistoricLayerInfo::Image {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start: lsn_range.start,
-            remote: false,
-            access_stats: self.access_stats.as_api_model(reset),
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
 impl ImageLayer {
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        fname: &ImageFileName,
-    ) -> PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.to_path_buf(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(&tenant_id, &timeline_id)
-                .join(fname.to_string()),
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        self.desc.dump();
+
+        if !verbose {
+            return Ok(());
        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+
+        inner.dump().await?;
+
+        Ok(())
    }

    fn temp_path_for(
@@ -318,83 +230,34 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
-        self.access_stats
-            .record_access(access_kind, ctx.task_kind());
-        loop {
-            if let Some(inner) = self.inner.get() {
-                return Ok(inner);
-            }
-            self.inner
-                .get_or_try_init(|| self.load_inner())
-                .with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
-        }
+    async fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<&ImageLayerInner> {
+        self.access_stats.record_access(access_kind, ctx);
+        self.inner
+            .get_or_try_init(|| self.load_inner())
+            .await
+            .with_context(|| format!("Failed to load image layer {}", self.path().display()))
    }

-    fn load_inner(&self) -> Result<ImageLayerInner> {
+    async fn load_inner(&self) -> Result<ImageLayerInner> {
        let path = self.path();

-        // Open the file if it's not open already.
-        let file = VirtualFile::open(&path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0)?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None)?;

-        match &self.path_or_conf {
-            PathOrConf::Conf(_) => {
-                let mut expected_summary = Summary::from(self);
-                expected_summary.index_start_blk = actual_summary.index_start_blk;
-                expected_summary.index_root_blk = actual_summary.index_root_blk;
+        // not production code
+        let actual_filename = self.path.file_name().unwrap().to_str().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-                if actual_summary != expected_summary {
-                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
-                }
-            }
-            PathOrConf::Path(path) => {
-                let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-                let expected_filename = self.filename().file_name();
-
-                if actual_filename != expected_filename {
-                    println!(
-                        "warning: filename does not match what is expected from in-file summary"
-                    );
-                    println!("actual: {:?}", actual_filename);
-                    println!("expected: {:?}", expected_filename);
-                }
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

-        Ok(ImageLayerInner {
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-            file,
-        })
-    }
-
-    /// Create an ImageLayer struct representing an existing file on disk
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &ImageFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> ImageLayer {
-        ImageLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_img(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn,
-                false,
-                file_size,
-            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
-            lsn: filename.lsn,
-            access_stats,
-            inner: OnceCell::new(),
-        }
+        Ok(loaded)
    }

    /// Create an ImageLayer struct representing an existing file on disk.
@@ -409,13 +272,12 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
                summary.key_range,
                summary.lsn,
-                false,
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
@@ -424,18 +286,69 @@ impl ImageLayer {
        })
    }

-    fn layer_name(&self) -> ImageFileName {
-        self.desc.image_file_name()
+    /// Path to the layer file in pageserver workdir.
+    fn path(&self) -> PathBuf {
+        self.path.clone()
+    }
+}
+
+impl ImageLayerInner {
+    pub(super) fn load(
+        path: &std::path::Path,
+        lsn: Lsn,
+        summary: Option<Summary>,
+    ) -> anyhow::Result<Self> {
+        let file = VirtualFile::open(path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0)?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+
+        if let Some(mut expected_summary) = summary {
+            // production code path
+            expected_summary.index_start_blk = actual_summary.index_start_blk;
+            expected_summary.index_root_blk = actual_summary.index_root_blk;
+
+            if actual_summary != expected_summary {
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
+            }
+        }
+
+        Ok(ImageLayerInner {
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+            lsn,
+            file,
+        })
    }

-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            self.desc.timeline_id,
-            self.desc.tenant_id,
-            &self.layer_name(),
-        )
+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        reconstruct_state: &mut ValueReconstructState,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        if let Some(offset) = tree_reader.get(&keybuf).await? {
+            let blob = file
+                .block_cursor()
+                .read_blob(offset)
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+
+            reconstruct_state.img = Some((self.lsn, value));
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing)
+        }
    }
 }

@@ -457,7 +370,6 @@ struct ImageLayerWriterInner {
    tenant_id: TenantId,
    key_range: Range<Key>,
    lsn: Lsn,
-    is_incremental: bool,

    blob_writer: WriteBlobWriter<VirtualFile>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -473,7 +385,6 @@ impl ImageLayerWriterInner {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
-        is_incremental: bool,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
@@ -508,7 +419,6 @@ impl ImageLayerWriterInner {
            lsn,
            tree: tree_builder,
            blob_writer,
-            is_incremental,
        };

        Ok(writer)
@@ -533,7 +443,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
-    fn finish(self) -> anyhow::Result<ImageLayer> {
+    fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -569,40 +479,19 @@ impl ImageLayerWriterInner {
            self.timeline_id,
            self.key_range.clone(),
            self.lsn,
-            self.is_incremental, // for now, image layer ALWAYS covers the full range
            metadata.len(),
        );

        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = ImageLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc,
-            lsn: self.lsn,
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };

        // fsync the file
        file.sync_all()?;

-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = ImageLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            self.timeline_id,
-            self.tenant_id,
-            &ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn,
-            },
-        );
-        std::fs::rename(self.path, final_path)?;
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", layer.path().display());
+        trace!("created image layer {}", layer.local_path().display());

        Ok(layer)
    }
@@ -644,7 +533,6 @@ impl ImageLayerWriter {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
-        is_incremental: bool,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(ImageLayerWriterInner::new(
@@ -653,7 +541,6 @@ impl ImageLayerWriter {
                tenant_id,
                key_range,
                lsn,
-                is_incremental,
            )?),
        })
    }
@@ -670,8 +557,11 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
-    pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
-        self.inner.take().unwrap().finish()
+    pub(crate) fn finish(
+        mut self,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner.take().unwrap().finish(timeline)
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,15 +7,15 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::{Key, Value};
-use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
-use std::cell::RefCell;
 use std::collections::HashMap;
+use std::sync::{Arc, OnceLock};
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -27,29 +27,25 @@ use utils::{
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use std::sync::RwLock;
+use tokio::sync::RwLock;

-use super::{DeltaLayer, DeltaLayerWriter, Layer};
-
-thread_local! {
-    /// A buffer for serializing object during [`InMemoryLayer::put_value`].
-    /// This buffer is reused for each serialization to avoid additional malloc calls.
-    static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
-}
+use super::{DeltaLayerWriter, ResidentLayer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    timeline_id: TimelineId,

-    ///
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
-    ///
    start_lsn: Lsn,

-    /// The above fields never change. The parts that do change are in 'inner',
-    /// and protected by mutex.
+    /// Frozen layers have an exclusive end LSN.
+    /// Writes are only allowed when this is `None`.
+    end_lsn: OnceLock<Lsn>,
+
+    /// The above fields never change, except for `end_lsn`, which is only set once.
+    /// All other changing parts are in `inner`, and protected by a `tokio::sync::RwLock`.
    inner: RwLock<InMemoryLayerInner>,
 }

@@ -57,21 +53,16 @@ impl std::fmt::Debug for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
+            .field("end_lsn", &self.end_lsn)
            .field("inner", &self.inner)
            .finish()
    }
 }

 pub struct InMemoryLayerInner {
-    /// Frozen layers have an exclusive end LSN.
-    /// Writes are only allowed when this is None
-    end_lsn: Option<Lsn>,
-
-    ///
    /// All versions of all pages in the layer are kept here.  Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
-    ///
    index: HashMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
@@ -82,65 +73,44 @@ pub struct InMemoryLayerInner {

 impl std::fmt::Debug for InMemoryLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("InMemoryLayerInner")
-            .field("end_lsn", &self.end_lsn)
-            .finish()
-    }
-}
-
-impl InMemoryLayerInner {
-    fn assert_writeable(&self) {
-        assert!(self.end_lsn.is_none());
+        f.debug_struct("InMemoryLayerInner").finish()
    }
 }

 impl InMemoryLayer {
-    pub fn get_timeline_id(&self) -> TimelineId {
+    pub(crate) fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }

-    pub fn info(&self) -> InMemoryLayerInfo {
+    pub(crate) fn info(&self) -> InMemoryLayerInfo {
        let lsn_start = self.start_lsn;
-        let lsn_end = self.inner.read().unwrap().end_lsn;

-        match lsn_end {
-            Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
-            None => InMemoryLayerInfo::Open { lsn_start },
+        if let Some(&lsn_end) = self.end_lsn.get() {
+            InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
+        } else {
+            InMemoryLayerInfo::Open { lsn_start }
        }
    }
-}

-#[async_trait::async_trait]
-impl Layer for InMemoryLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        Key::MIN..Key::MAX
+    pub(crate) fn assert_writable(&self) {
+        assert!(self.end_lsn.get().is_none());
    }

-    fn get_lsn_range(&self) -> Range<Lsn> {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = if let Some(end_lsn) = inner.end_lsn {
-            end_lsn
-        } else {
-            Lsn(u64::MAX)
-        };
-        self.start_lsn..end_lsn
+    pub(crate) fn end_lsn_or_max(&self) -> Lsn {
+        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
    }

-    fn is_incremental(&self) -> bool {
-        // in-memory layer is always considered incremental.
-        true
+    pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
+        self.start_lsn..self.end_lsn_or_max()
    }

    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().unwrap();
+    ///
+    /// this is likely completely unused
+    pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
+        let inner = self.inner.read().await;

-        let end_str = inner
-            .end_lsn
-            .as_ref()
-            .map(Lsn::to_string)
-            .unwrap_or_default();
+        let end_str = self.end_lsn_or_max();

        println!(
            "----- in-memory layer for tli {} LSNs {}-{} ----",
@@ -151,12 +121,12 @@ impl Layer for InMemoryLayer {
            return Ok(());
        }

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();
        let mut buf = Vec::new();
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let val = Value::des(&buf);
                match val {
                    Ok(Value::Image(img)) => {
@@ -184,7 +154,7 @@ impl Layer for InMemoryLayer {
    }

    /// Look up given value in the layer.
-    async fn get_value_reconstruct_data(
+    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -194,15 +164,15 @@ impl Layer for InMemoryLayer {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

-        let mut reader = inner.file.block_cursor();
+        let reader = inner.file.block_cursor();

        // Scan the page versions backwards, starting from `lsn`.
        if let Some(vec_map) = inner.index.get(&key) {
            let slice = vec_map.slice_range(lsn_range);
            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos)?;
+                let buf = reader.read_blob(*pos).await?;
                let value = Value::des(&buf)?;
                match value {
                    Value::Image(img) => {
@@ -236,25 +206,19 @@ impl Layer for InMemoryLayer {

 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
+        let end_lsn = self.end_lsn_or_max();
        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
    }
 }

 impl InMemoryLayer {
-    ///
-    /// Get layer size on the disk
-    ///
-    pub fn size(&self) -> Result<u64> {
-        let inner = self.inner.read().unwrap();
-        Ok(inner.file.size)
+    /// Get layer size.
+    pub async fn size(&self) -> Result<u64> {
+        let inner = self.inner.read().await;
+        Ok(inner.file.len())
    }

-    ///
    /// Create a new, empty, in-memory layer
-    ///
    pub fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -270,8 +234,8 @@ impl InMemoryLayer {
            timeline_id,
            tenant_id,
            start_lsn,
+            end_lsn: OnceLock::new(),
            inner: RwLock::new(InMemoryLayerInner {
-                end_lsn: None,
                index: HashMap::new(),
                file,
            }),
@@ -282,19 +246,19 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let mut inner = self.inner.write().unwrap();
-        inner.assert_writeable();
+        let inner: &mut _ = &mut *self.inner.write().await;
+        self.assert_writable();

        let off = {
-            SER_BUFFER.with(|x| -> Result<_> {
-                let mut buf = x.borrow_mut();
-                buf.clear();
-                val.ser_into(&mut (*buf))?;
-                let off = inner.file.write_blob(&buf)?;
-                Ok(off)
-            })?
+            // Avoid doing allocations for "small" values.
+            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+            buf.clear();
+            val.ser_into(&mut buf)?;
+            inner.file.write_blob(&buf).await?
        };

        let vec_map = inner.index.entry(key).or_default();
@@ -316,11 +280,11 @@ impl InMemoryLayer {
    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
-    pub fn freeze(&self, end_lsn: Lsn) {
-        let mut inner = self.inner.write().unwrap();
+    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;

        assert!(self.start_lsn < end_lsn);
-        inner.end_lsn = Some(end_lsn);
+        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
@@ -332,7 +296,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub(crate) async fn write_to_disk(&self, timeline: &Arc<Timeline>) -> Result<ResidentLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -342,19 +306,21 @@ impl InMemoryLayer {
        // lock, it will see that it's not writeable anymore and retry, but it
        // would have to wait until we release it. That race condition is very
        // rare though, so we just accept the potential latency hit for now.
-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;
+
+        let end_lsn = *self.end_lsn.get().unwrap();

        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_id,
            Key::MIN,
-            self.start_lsn..inner.end_lsn.unwrap(),
+            self.start_lsn..end_lsn,
        )?;

        let mut buf = Vec::new();

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();

        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);
@@ -363,13 +329,13 @@ impl InMemoryLayer {
            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let will_init = Value::des(&buf)?.will_init();
                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
            }
        }

-        let delta_layer = delta_layer_writer.finish(Key::MAX)?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline)?;
        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,4 +1,3 @@
-use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
@@ -6,7 +5,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{context::RequestContext, repository::Key};
+use crate::repository::Key;

 use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -19,16 +18,17 @@ use serde::{Deserialize, Serialize};
 pub struct PersistentLayerDesc {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
+    /// Range of keys that this layer covers
    pub key_range: Range<Key>,
-    /// For image layer, this is `[lsn, lsn+1)`.
+    /// Inclusive start, exclusive end of the LSN range that this layer holds.
+    ///
+    /// - For an open in-memory layer, the end bound is `Lsn::MAX`
+    /// - For a frozen in-memory layer or a delta layer, the end bound is a valid LSN after the
+    ///   range start
+    /// - An image layer represents a snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
    pub lsn_range: Range<Lsn>,
-    /// Whether this is a delta layer.
+    /// Whether this is a delta layer. Delta layers are also exactly the incremental layers.
    pub is_delta: bool,
-    /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
-    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
-    /// incremental.
-    pub is_incremental: bool,
-    /// File size
    pub file_size: u64,
 }

@@ -61,7 +61,6 @@ impl PersistentLayerDesc {
            key_range,
            lsn_range: Lsn(0)..Lsn(1),
            is_delta: false,
-            is_incremental: false,
            file_size: 0,
        }
    }
@@ -71,7 +70,6 @@ impl PersistentLayerDesc {
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn: Lsn,
-        is_incremental: bool,
        file_size: u64,
    ) -> Self {
        Self {
@@ -80,7 +78,6 @@ impl PersistentLayerDesc {
            key_range,
            lsn_range: Self::image_layer_lsn_range(lsn),
            is_delta: false,
-            is_incremental,
            file_size,
        }
    }
@@ -98,11 +95,26 @@ impl PersistentLayerDesc {
            key_range,
            lsn_range,
            is_delta: true,
-            is_incremental: true,
            file_size,
        }
    }

+    pub fn from_filename(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        filename: LayerFileName,
+        file_size: u64,
+    ) -> Self {
+        match filename {
+            LayerFileName::Image(i) => {
+                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
+            }
+            LayerFileName::Delta(d) => {
+                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+            }
+        }
+    }
+
    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -164,29 +176,43 @@ impl PersistentLayerDesc {
        self.tenant_id
    }

+    /// Does this layer only contain some data for the key-range (incremental),
+    /// or does it contain a version of every page? This is important to know
+    /// for garbage collecting old layers: an incremental layer depends on
+    /// the previous non-incremental layer.
    pub fn is_incremental(&self) -> bool {
-        self.is_incremental
+        self.is_delta
    }

    pub fn is_delta(&self) -> bool {
        self.is_delta
    }

-    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental,
-            self.file_size,
-        );
-
-        Ok(())
+    pub fn dump(&self) {
+        if self.is_delta {
+            println!(
+                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.lsn_range.start,
+                self.lsn_range.end,
+                self.is_incremental(),
+                self.file_size,
+            );
+        } else {
+            println!(
+                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.image_layer_lsn(),
+                self.is_incremental(),
+                self.file_size
+            );
+        }
    }

    pub fn file_size(&self) -> u64 {
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -1,262 +0,0 @@
-//! A RemoteLayer is an in-memory placeholder for a layer file that exists
-//! in remote storage.
-//!
-use crate::config::PageServerConf;
-use crate::context::RequestContext;
-use crate::repository::Key;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::timeline::layer_manager::LayerManager;
-use anyhow::{bail, Result};
-use pageserver_api::models::HistoricLayerInfo;
-use std::ops::Range;
-use std::path::PathBuf;
-use std::sync::Arc;
-
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-use super::filename::{DeltaFileName, ImageFileName};
-use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
-};
-
-/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
-/// [`DeltaLayer`](super::DeltaLayer).
-///
-/// RemoteLayer might be downloaded on-demand during operations which are
-/// allowed download remote layers and during which, it gets replaced with a
-/// concrete `DeltaLayer` or `ImageLayer`.
-///
-/// See: [`crate::context::RequestContext`] for authorization to download
-pub struct RemoteLayer {
-    pub desc: PersistentLayerDesc,
-
-    pub layer_metadata: LayerFileMetadata,
-
-    access_stats: LayerAccessStats,
-
-    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
-
-    /// Has `LayerMap::replace` failed for this (true) or not (false).
-    ///
-    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
-    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
-    /// unprocessable, because a LayerMap::replace failed.
-    ///
-    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
-    /// a possible fast loop between `Timeline::get_reconstruct_data` and
-    /// `Timeline::download_remote_layer`, which also logs.
-    ///
-    /// [`ongoing_download`]: Self::ongoing_download
-    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
-}
-
-impl std::fmt::Debug for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.desc.filename())
-            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.desc.is_incremental)
-            .finish()
-    }
-}
-
-#[async_trait::async_trait]
-impl Layer for RemoteLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_state: &mut ValueReconstructState,
-        _ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        bail!("layer {self} needs to be downloaded");
-    }
-
-    /// debugging function to print out the contents of the layer
-    async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.is_delta,
-            self.desc.is_incremental,
-            self.desc.file_size,
-        );
-
-        Ok(())
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_key_range(&self) -> Range<Key> {
-        self.layer_desc().key_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.layer_desc().lsn_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn is_incremental(&self) -> bool {
-        self.layer_desc().is_incremental
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for RemoteLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for RemoteLayer {
-    fn local_path(&self) -> Option<PathBuf> {
-        None
-    }
-
-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        bail!("remote layer has no layer file");
-    }
-
-    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        Some(self)
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        true
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.filename().file_name();
-        let lsn_range = self.get_lsn_range();
-
-        if self.desc.is_delta {
-            HistoricLayerInfo::Delta {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                lsn_end: lsn_range.end,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        } else {
-            HistoricLayerInfo::Image {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
-impl RemoteLayer {
-    pub fn new_img(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &ImageFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_img(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn,
-                false,
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    pub fn new_delta(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &DeltaFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_delta(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn_range.clone(),
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub fn create_downloaded_layer(
-        &self,
-        layer_map_lock_held_witness: &LayerManager,
-        conf: &'static PageServerConf,
-        file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
-        if self.desc.is_delta {
-            let fname = self.desc.delta_file_name();
-            Arc::new(DeltaLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats.clone_for_residence_change(
-                    layer_map_lock_held_witness,
-                    LayerResidenceStatus::Resident,
-                ),
-            ))
-        } else {
-            let fname = self.desc.image_file_name();
-            Arc::new(ImageLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats.clone_for_residence_change(
-                    layer_map_lock_held_witness,
-                    LayerResidenceStatus::Resident,
-                ),
-            ))
-        }
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -73,17 +73,13 @@ pub fn start_background_loops(
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
-    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
        let mut first = true;
        loop {
-            trace!("waking up");
-
            tokio::select! {
                _ = cancel.cancelled() => {
-                    info!("received cancellation request");
                    return;
                },
                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -126,15 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                .await
                .is_ok()
            {
-                info!("received cancellation request during idling");
                break;
            }
        }
    }
    .await;
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
-
-    trace!("compaction loop stopped.");
 }

 ///
@@ -142,7 +135,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
-    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -151,11 +143,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
        let mut first = true;
        loop {
-            trace!("waking up");
-
            tokio::select! {
                _ = cancel.cancelled() => {
-                    info!("received cancellation request");
                    return;
                },
                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -200,14 +189,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                .await
                .is_ok()
            {
-                info!("received cancellation request during idling");
                break;
            }
        }
    }
    .await;
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
-    trace!("GC loop stopped.");
 }

 async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
@@ -232,7 +219,6 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
                    }
                }
                Err(_sender_dropped_error) => {
-                    info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
                    return ControlFlow::Break(());
                }
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -25,7 +25,7 @@ use crate::{
    InitializationOrder,
 };

-use super::Timeline;
+use super::{Timeline, TimelineResources};

 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
@@ -219,27 +219,13 @@ async fn delete_local_layer_files(
            }
        };

-        let r = if metadata.is_dir() {
-            // There shouldnt be any directories inside timeline dir as of current layout.
+        if metadata.is_dir() {
+            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
            tokio::fs::remove_dir(entry.path()).await
        } else {
            tokio::fs::remove_file(entry.path()).await
-        };
-
-        if let Err(e) = r {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                warn!(
-                    timeline_dir=?local_timeline_directory,
-                    path=?entry.path().display(),
-                    "got not found err while removing timeline dir, proceeding anyway"
-                );
-                continue;
-            }
-            anyhow::bail!(anyhow::anyhow!(
-                "Failed to remove: {}. Error: {e}",
-                entry.path().display()
-            ));
        }
+        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
    }

    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
@@ -293,6 +279,17 @@ async fn cleanup_remaining_timeline_fs_traces(
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
    });

+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So it's possible for the mark to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we don't need to sync after the mark file is removed
+    // because we can tolerate the case when the mark file reappears on startup.
+    let timeline_path = conf.timelines_path(&tenant_id);
+    crashsafe::fsync_async(timeline_path)
+        .await
+        .context("fsync_pre_mark_remove")?;
+
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
@@ -359,10 +356,11 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
+        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

@@ -380,7 +378,11 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }

        Ok(())
    }
@@ -398,6 +400,8 @@ impl DeleteTimelineFlow {
    }

    /// Shortcut to create Timeline in stopping state and spawn deletion task.
+    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
+    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn resume_deletion(
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
@@ -412,7 +416,7 @@ impl DeleteTimelineFlow {
                timeline_id,
                local_metadata,
                None, // Ancestor is not needed for deletion.
-                remote_client,
+                TimelineResources { remote_client },
                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
@@ -444,11 +448,15 @@ impl DeleteTimelineFlow {
        Ok(())
    }

+    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn cleanup_remaining_timeline_fs_traces(
        tenant: &Tenant,
        timeline_id: TimelineId,
    ) -> anyhow::Result<()> {
-        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
+        let r =
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+        info!("Done");
+        r
    }

    fn prepare(
@@ -494,11 +502,17 @@ impl DeleteTimelineFlow {
        // At the end of the operation we're holding the guard and need to lock timelines map
        // to remove the timeline from it.
        // Always if you have two locks that are taken in different order this can result in a deadlock.
-        let delete_lock_guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
-        );
+
+        let delete_progress = Arc::clone(&timeline.delete_progress);
+        let delete_lock_guard = match delete_progress.try_lock_owned() {
+            Ok(guard) => DeletionGuard(guard),
+            Err(_) => {
+                // Unfortunately if lock fails arc is consumed.
+                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
+                    &timeline.delete_progress,
+                )));
+            }
+        };

        timeline.set_state(TimelineState::Stopping);

@@ -553,10 +567,14 @@ impl DeleteTimelineFlow {

        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

-        *guard.0 = Self::Finished;
+        *guard = Self::Finished;

        Ok(())
    }
+
+    pub(crate) fn is_finished(&self) -> bool {
+        matches!(self, Self::Finished)
+    }
 }

 struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -29,7 +29,6 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        storage_layer::PersistentLayer,
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -78,9 +77,6 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
-        scopeguard::defer! {
-            info!("eviction task finishing");
-        }
        use crate::tenant::tasks::random_init_delay;
        {
            let policy = self.get_eviction_policy();
@@ -197,15 +193,26 @@ impl Timeline {
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
-        let candidates: Vec<Arc<dyn PersistentLayer>> = {
+        let candidates: Vec<_> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);
-                if hist_layer.is_remote_layer() {
-                    continue;
-                }
+
+                // guard against eviction while we inspect it; it might be that eviction_task and
+                // disk_usage_eviction_task both select the same layers to be evicted, and
+                // seemingly free up double the space. both succeeding is of no consequence.
+                let guard = match hist_layer.keep_resident().await {
+                    Ok(Some(l)) => l,
+                    Ok(None) => continue,
+                    Err(e) => {
+                        // these should not happen, but we cannot make them statically impossible right
+                        // now.
+                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
+                        continue;
+                    }
+                };

                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
@@ -236,7 +243,7 @@ impl Timeline {
                    }
                };
                if no_activity_for > p.threshold {
-                    candidates.push(hist_layer)
+                    candidates.push(guard.drop_eviction_guard())
                }
            }
            candidates
@@ -255,7 +262,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
+            .evict_layer_batch(remote_client, &candidates, cancel)
            .await
        {
            Err(pre_err) => {
@@ -266,7 +273,7 @@ impl Timeline {
            Ok(results) => results,
        };
        assert_eq!(results.len(), candidates.len());
-        for (l, result) in candidates.iter().zip(results) {
+        for result in results {
            match result {
                None => {
                    stats.skipped_for_shutdown += 1;
@@ -274,20 +281,10 @@ impl Timeline {
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                    stats.not_evictable += 1;
-                }
-                Some(Err(EvictionError::FileNotFound)) => {
+                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
-                Some(Err(
-                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
-                )) => {
-                    let e = utils::error::report_compact_sources(&e);
-                    warn!(layer = %l, "failed to evict layer: {e}");
-                    stats.not_evictable += 1;
-                }
            }
        }
        if stats.candidates == stats.not_evictable {
@@ -308,8 +305,13 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let mut state = self.eviction_task_timeline_state.lock().await;
+
+        // Only do the imitate_layer accesses approximately as often as the threshold.  A little
+        // more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
+        let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
+
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
                    .await;
@@ -332,7 +334,7 @@ impl Timeline {
        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
                    .await;
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -0,0 +1,199 @@
+use crate::{
+    is_temporary,
+    tenant::{
+        ephemeral_file::is_ephemeral_file,
+        remote_timeline_client::{
+            self,
+            index::{IndexPart, LayerFileMetadata},
+        },
+        storage_layer::LayerFileName,
+    },
+    METADATA_FILE_NAME,
+};
+use anyhow::Context;
+use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
+use utils::lsn::Lsn;
+
+/// Classification of a single entry found in the timeline directory by [`scan_timeline_dir`].
+pub(super) enum Discovered {
+    /// A layer file: its parsed name and its on-disk size. The only kind we care about.
+    Layer(LayerFileName, u64),
+    /// Old ephemeral files from previous launches, should be removed
+    Ephemeral(OsString),
+    /// Old temporary timeline files, unsure what these really are, should be removed
+    Temporary(OsString),
+    /// Temporary on-demand download files, should be removed
+    TemporaryDownload(OsString),
+    /// "metadata" file we persist locally and include in `index_part.json`
+    Metadata,
+    /// Backup file (name ends with `.old`) from previously future layers; ignored
+    IgnoredBackup,
+    /// Unrecognized, warn about these
+    Unknown(OsString),
+}
+
+/// Scans the timeline directory for interesting files.
+///
+/// Each directory entry is classified into exactly one [`Discovered`] variant. Nothing is
+/// removed or renamed here; the function only reads the directory.
+pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
+    let mut found = Vec::new();
+
+    for entry in std::fs::read_dir(path)? {
+        let entry = entry?;
+        let entry_path = entry.path();
+        let os_name = entry.file_name();
+
+        // classification uses the lossy UTF-8 form; the variants keep the raw OsString
+        let fname = os_name.to_string_lossy();
+
+        let kind = if let Ok(layer_name) = LayerFileName::from_str(&fname) {
+            // only layer files carry a size, so metadata() is not queried for anything else
+            let file_size = entry.metadata()?.len();
+            Discovered::Layer(layer_name, file_size)
+        } else if fname == METADATA_FILE_NAME {
+            Discovered::Metadata
+        } else if fname.ends_with(".old") {
+            // ignore these
+            Discovered::IgnoredBackup
+        } else if remote_timeline_client::is_temp_download_file(&entry_path) {
+            Discovered::TemporaryDownload(os_name)
+        } else if is_ephemeral_file(&fname) {
+            Discovered::Ephemeral(os_name)
+        } else if is_temporary(&entry_path) {
+            Discovered::Temporary(os_name)
+        } else {
+            Discovered::Unknown(os_name)
+        };
+
+        found.push(kind);
+    }
+
+    Ok(found)
+}
+
+/// Decision on what to do with a layer file after considering its local and remote metadata.
+#[derive(Clone)]
+pub(super) enum Decision {
+    /// The layer is not present locally; carries the remote metadata.
+    Evicted(LayerFileMetadata),
+    /// The layer is present locally, but local metadata does not match remote; we must
+    /// delete it and treat it as evicted.
+    UseRemote {
+        local: LayerFileMetadata,
+        remote: LayerFileMetadata,
+    },
+    /// The layer is present locally, and metadata matches; carries the local metadata.
+    UseLocal(LayerFileMetadata),
+    /// The layer is only known locally, it needs to be uploaded; carries the local metadata.
+    NeedsUpload(LayerFileMetadata),
+}
+
+/// The related layer is in the future compared to `disk_consistent_lsn`; it must not be loaded.
+#[derive(Debug)]
+pub(super) struct FutureLayer {
+    /// The local metadata. `None` if the layer is only known through [`IndexPart`].
+    pub(super) local: Option<LayerFileMetadata>,
+}
+
+/// Merges local discoveries and remote [`IndexPart`] into one decision per layer file.
+///
+/// [`FutureLayer`] is the only per-layer failure; this function should not gain other reasons
+/// to fail, consider adding such checks earlier to [`scan_timeline_dir`] instead.
+pub(super) fn reconcile(
+    discovered: Vec<(LayerFileName, u64)>,
+    index_part: Option<&IndexPart>,
+    disk_consistent_lsn: Lsn,
+) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
+    use Decision::*;
+
+    // name => (local, remote)
+    type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
+
+    // seed the map with what is on disk; the remote half is filled in below
+    let mut merged = discovered
+        .into_iter()
+        .map(|(name, file_size)| (name, (Some(LayerFileMetadata::new(file_size)), None)))
+        .collect::<Collected>();
+
+    // merge any index_part information, when available
+    if let Some(index_part) = index_part {
+        for (name, metadata) in index_part.layer_metadata.iter() {
+            let remote = LayerFileMetadata::from(metadata);
+            match merged.get_mut(name) {
+                Some(entry) => entry.1 = Some(remote),
+                None => {
+                    merged.insert(name.to_owned(), (None, Some(remote)));
+                }
+            }
+        }
+    }
+
+    merged
+        .into_iter()
+        .map(|(name, (local, remote))| {
+            // a future layer is reported as an error no matter where it was seen
+            if name.is_in_future(disk_consistent_lsn) {
+                return (name, Err(FutureLayer { local }));
+            }
+
+            let decision = match (local, remote) {
+                (Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
+                (Some(x), Some(_)) => UseLocal(x),
+                (None, Some(x)) => Evicted(x),
+                (Some(x), None) => NeedsUpload(x),
+                (None, None) => {
+                    unreachable!("there must not be any non-local non-remote files")
+                }
+            };
+
+            (name, Ok(decision))
+        })
+        .collect::<Vec<_>>()
+}
+
+pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
+    let file_name = path.file_name().expect("must be file path");
+    tracing::debug!(kind, ?file_name, "cleaning up");
+    let describe_failure = || format!("failed to remove {kind} at {}", path.display());
+    std::fs::remove_file(path).with_context(describe_failure)
+}
+
+/// Warns about a local/remote size mismatch and hands the local file to `rename_to_backup`.
+pub(super) fn cleanup_local_file_for_remote(
+    path: &Path,
+    local: &LayerFileMetadata,
+    remote: &LayerFileMetadata,
+) -> anyhow::Result<()> {
+    let (local_size, remote_size) = (local.file_size(), remote.file_size());
+    let file_name = path.file_name().expect("must be file path");
+    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+    match crate::tenant::timeline::rename_to_backup(path) {
+        Ok(_) => Ok(()),
+        Err(err) => {
+            assert!(
+                path.exists(),
+                "we would leave the local_layer without a file if this does not hold: {}",
+                path.display()
+            );
+            Err(err)
+        }
+    }
+}
+
+/// Logs the future layer and hands its file to `rename_to_backup`.
+pub(super) fn cleanup_future_layer(
+    path: &Path,
+    name: &LayerFileName,
+    disk_consistent_lsn: Lsn,
+) -> anyhow::Result<()> {
+    let kind = match name {
+        LayerFileName::Delta(_) => "delta",
+        LayerFileName::Image(_) => "image",
+    };
+    // future image layers are allowed to be produced always for not yet flushed to disk
+    // lsns stored in InMemoryLayer.
+    tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
+    crate::tenant::timeline::rename_to_backup(path)?;
+    Ok(())
+}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -8,42 +8,40 @@ use utils::{

 use crate::{
    config::PageServerConf,
-    metrics::TimelineMetrics,
    tenant::{
        layer_map::{BatchedUpdates, LayerMap},
        storage_layer::{
-            AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer,
-            PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
+            AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
+            ResidentLayer,
        },
-        timeline::compare_arced_layers,
    },
 };

 /// Provides semantic APIs to manipulate the layer map.
-pub struct LayerManager {
+pub(crate) struct LayerManager {
    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager,
+    layer_fmgr: LayerFileManager<Layer>,
 }

 /// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
 /// scheduling deletes in remote client.
-pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
+pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);

 impl ApplyGcResultGuard<'_> {
-    pub fn flush(self) {
+    pub(crate) fn flush(self) {
        self.0.flush();
    }
 }

 impl LayerManager {
-    pub fn create() -> Self {
+    pub(crate) fn create() -> Self {
        Self {
            layer_map: LayerMap::default(),
            layer_fmgr: LayerFileManager::new(),
        }
    }

-    pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
+    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.layer_fmgr.get_from_desc(desc)
    }

@@ -51,31 +49,16 @@ impl LayerManager {
    ///
    /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
    /// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
-    pub fn layer_map(&self) -> &LayerMap {
+    pub(crate) fn layer_map(&self) -> &LayerMap {
        &self.layer_map
    }

-    /// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer`
-    /// gets a refactor.
-    pub fn layer_map_mut(&mut self) -> &mut LayerMap {
-        &mut self.layer_map
-    }
-
-    /// Replace layers in the layer file manager, used in evictions and layer downloads.
-    pub fn replace_and_verify(
-        &mut self,
-        expected: Arc<dyn PersistentLayer>,
-        new: Arc<dyn PersistentLayer>,
-    ) -> Result<()> {
-        self.layer_fmgr.replace_and_verify(expected, new)
-    }
-
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk disk_consistent_lsn LSN)
-    pub fn initialize_local_layers(
+    pub(crate) fn initialize_local_layers(
        &mut self,
-        on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
+        on_disk_layers: Vec<Layer>,
        next_open_layer_at: Lsn,
    ) {
        let mut updates = self.layer_map.batch_update();
@@ -87,28 +70,13 @@ impl LayerManager {
    }

    /// Initialize when creating a new timeline, called in `init_empty_layer_map`.
-    pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
+    pub(crate) fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

-    pub fn initialize_remote_layers(
-        &mut self,
-        corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
-        remote_layers: Vec<Arc<RemoteLayer>>,
-    ) {
-        let mut updates = self.layer_map.batch_update();
-        for layer in corrupted_local_layers {
-            Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
-        }
-        for layer in remote_layers {
-            Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
-        }
-        updates.flush();
-    }
-
    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
    /// called within `get_layer_for_write`.
-    pub fn get_layer_for_write(
+    pub(crate) fn get_layer_for_write(
        &mut self,
        lsn: Lsn,
        last_record_lsn: Lsn,
@@ -120,10 +88,9 @@ impl LayerManager {

        ensure!(
            lsn > last_record_lsn,
-            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
+            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
            lsn,
            last_record_lsn,
-            std::backtrace::Backtrace::force_capture(),
        );

        // Do we have a layer open for writing already?
@@ -164,7 +131,7 @@ impl LayerManager {
    }

    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub fn try_freeze_in_memory_layer(
+    pub(crate) async fn try_freeze_in_memory_layer(
        &mut self,
        Lsn(last_record_lsn): Lsn,
        last_freeze_at: &AtomicLsn,
@@ -174,7 +141,7 @@ impl LayerManager {
        if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
            // Does this layer need freezing?
-            open_layer.freeze(end_lsn);
+            open_layer.freeze(end_lsn).await;

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
@@ -186,140 +153,123 @@ impl LayerManager {
    }

    /// Add image layers to the layer map, called from `create_image_layers`.
-    pub fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
+    pub(crate) fn track_new_image_layers(&mut self, image_layers: &[ResidentLayer]) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
-            Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
    }

    /// Flush a frozen layer and add the written delta layer to the layer map.
-    pub fn finish_flush_l0_layer(
+    pub(crate) fn finish_flush_l0_layer(
        &mut self,
-        delta_layer: Option<DeltaLayer>,
+        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
    ) {
-        let l = self.layer_map.frozen_layers.pop_front();
-        let mut updates = self.layer_map.batch_update();
+        let inmem = self
+            .layer_map
+            .frozen_layers
+            .pop_front()
+            .expect("there must be a inmem layer to flush");

-        // Only one thread may call this function at a time (for this
-        // timeline). If two threads tried to flush the same frozen
+        // Only one task may call this function at a time (for this
+        // timeline). If two tasks tried to flush the same frozen
        // layer to disk at the same time, that would not work.
-        assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
+        assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));

-        if let Some(delta_layer) = delta_layer {
-            Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
+        if let Some(l) = delta_layer {
+            let mut updates = self.layer_map.batch_update();
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            updates.flush();
        }
-        updates.flush();
    }

    /// Called when compaction is completed.
-    pub fn finish_compact_l0(
+    pub(crate) fn finish_compact_l0(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        compact_from: Vec<Arc<dyn PersistentLayer>>,
-        compact_to: Vec<Arc<dyn PersistentLayer>>,
-        metrics: &TimelineMetrics,
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        compact_from: Vec<Layer>,
+        compact_to: &[ResidentLayer],
+        duplicates: &[(ResidentLayer, ResidentLayer)],
    ) -> Result<()> {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
-            Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
        }
        for l in compact_from {
            // NB: the layer file identified by descriptor `l` is guaranteed to be present
            // in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
            // time, even though we dropped `Timeline::layers` inbetween.
-            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
-                l,
-                &mut updates,
-                metrics,
-                &mut self.layer_fmgr,
-            )?;
+            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr)?;
+        }
+        for (old, new) in duplicates {
+            self.layer_fmgr.replace(old.as_ref(), new.as_ref().clone());
        }
        updates.flush();
        Ok(())
    }

    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
-    pub fn finish_gc_timeline(
+    pub(crate) fn finish_gc_timeline(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Arc<dyn PersistentLayer>>,
-        metrics: &TimelineMetrics,
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        gc_layers: Vec<Layer>,
    ) -> Result<ApplyGcResultGuard> {
        let mut updates = self.layer_map.batch_update();
        for doomed_layer in gc_layers {
            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
+                layer_removal_cs,
                doomed_layer,
                &mut updates,
-                metrics,
                &mut self.layer_fmgr,
-            )?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
+            )?;
        }
        Ok(ApplyGcResultGuard(updates))
    }

    /// Helper function to insert a layer into the layer map and file manager.
    fn insert_historic_layer(
-        layer: Arc<dyn PersistentLayer>,
+        layer: Layer,
        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
+        mapping: &mut LayerFileManager<Layer>,
    ) {
        updates.insert_historic(layer.layer_desc().clone());
        mapping.insert(layer);
    }

-    /// Helper function to remove a layer into the layer map and file manager
-    fn remove_historic_layer(
-        layer: Arc<dyn PersistentLayer>,
-        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
-    ) {
-        updates.remove_historic(layer.layer_desc().clone());
-        mapping.remove(layer);
-    }
-
    /// Removes the layer from local FS (if present) and from memory.
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        layer: Arc<dyn PersistentLayer>,
+        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layer: Layer,
        updates: &mut BatchedUpdates<'_>,
-        metrics: &TimelineMetrics,
-        mapping: &mut LayerFileManager,
+        mapping: &mut LayerFileManager<Layer>,
    ) -> anyhow::Result<()> {
-        if !layer.is_remote_layer() {
-            layer.delete_resident_layer_file()?;
-            let layer_file_size = layer.file_size();
-            metrics.resident_physical_size_gauge.sub(layer_file_size);
-        }
+        let desc = layer.layer_desc();

        // TODO Removing from the bottom of the layer map is expensive.
        //      Maybe instead discard all layer map historic versions that
        //      won't be needed for page reconstruction for this timeline,
        //      and mark what we can't delete yet as deleted from the layer
        //      map index without actually rebuilding the index.
-        updates.remove_historic(layer.layer_desc().clone());
-        mapping.remove(layer);
+        updates.remove_historic(desc);
+        mapping.remove(&layer);
+        layer.garbage_collect_on_drop();

        Ok(())
    }

-    pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
        self.layer_fmgr.contains(layer)
    }
 }

-pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
-    HashMap<PersistentLayerKey, Arc<T>>,
-);
+pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

-impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
+impl<T: AsLayerDesc + Clone + PartialEq + std::fmt::Debug> LayerFileManager<T> {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
@@ -329,14 +279,14 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
            .clone()
    }

-    pub(crate) fn insert(&mut self, layer: Arc<T>) {
+    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
            panic!("overwriting a layer: {:?}", layer.layer_desc())
        }
    }

-    pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
+    pub(crate) fn contains(&self, layer: &T) -> bool {
        self.0.contains_key(&layer.layer_desc().key())
    }

@@ -344,7 +294,7 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        Self(HashMap::new())
    }

-    pub(crate) fn remove(&mut self, layer: Arc<T>) {
+    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
            panic!(
@@ -354,38 +304,13 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        }
    }

-    pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
-        let key = expected.layer_desc().key();
-        let other = new.layer_desc().key();
+    pub(crate) fn replace(&mut self, old: &T, new: T) {
+        let key = old.layer_desc().key();
+        assert_eq!(key, new.layer_desc().key());

-        let expected_l0 = LayerMap::is_l0(expected.layer_desc());
-        let new_l0 = LayerMap::is_l0(new.layer_desc());
-
-        fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
-            "layermap-replace-notfound"
-        ));
-
-        anyhow::ensure!(
-            key == other,
-            "expected and new layer have different keys: {key:?} != {other:?}"
-        );
-
-        anyhow::ensure!(
-            expected_l0 == new_l0,
-            "one layer is l0 while the other is not: {expected_l0} != {new_l0}"
-        );
-
-        if let Some(layer) = self.0.get_mut(&key) {
-            anyhow::ensure!(
-                compare_arced_layers(&expected, layer),
-                "another layer was found instead of expected, expected={expected:?}, new={new:?}",
-                expected = Arc::as_ptr(&expected),
-                new = Arc::as_ptr(layer),
-            );
-            *layer = new;
-            Ok(())
-        } else {
-            anyhow::bail!("layer was not found");
+        if let Some(existing) = self.0.get_mut(&key) {
+            assert_eq!(existing, old);
+            *existing = new;
        }
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
-use crate::task_mgr::TaskKind;
+use crate::task_mgr::{shutdown_token, TaskKind};
 use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -31,14 +31,19 @@ use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use postgres_connection::{parse_host_port, PgConnectionConfig};
+use utils::backoff::{
+    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
 };

-use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
+use super::{
+    walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError,
+    TaskEvent, TaskHandle,
+};

 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
 /// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
@@ -206,11 +211,14 @@ async fn subscribe_for_timeline_updates(
    id: TenantTimelineId,
 ) -> Streaming<SafekeeperTimelineInfo> {
    let mut attempt = 0;
+    let cancel = shutdown_token();
+
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel,
        )
        .await;
        attempt += 1;
@@ -419,13 +427,19 @@ impl ConnectionManagerState {
                match res {
                    Ok(()) => Ok(()),
                    Err(e) => {
-                        use super::walreceiver_connection::ExpectedError;
-                        if e.is_expected() {
-                            info!("walreceiver connection handling ended: {e:#}");
-                            Ok(())
-                        } else {
-                            // give out an error to have task_mgr give it a really verbose logging
-                            Err(e).context("walreceiver connection handling failure")
+                        match e {
+                            WalReceiverError::SuccessfulCompletion(msg) => {
+                                info!("walreceiver connection handling ended with success: {msg}");
+                                Ok(())
+                            }
+                            WalReceiverError::ExpectedSafekeeperError(e) => {
+                                info!("walreceiver connection handling ended: {e}");
+                                Ok(())
+                            }
+                            WalReceiverError::Other(e) => {
+                                // give out an error to have task_mgr give it a really verbose logging
+                                Err(e).context("walreceiver connection handling failure")
+                            }
                        }
                    }
                }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -8,14 +8,14 @@ use std::{
    time::{Duration, SystemTime},
 };

-use anyhow::{bail, ensure, Context};
+use anyhow::{anyhow, Context};
 use bytes::BytesMut;
 use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
 use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
-use postgres_ffi::v14::xlog_utils::normalize_lsn;
 use postgres_ffi::WAL_SEGMENT_SIZE;
+use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError};
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
 use tokio::{select, sync::watch, time};
@@ -60,6 +60,50 @@ pub(super) struct WalConnectionStatus {
    pub node: NodeId,
 }

+pub(super) enum WalReceiverError {
+    /// An error that does not indicate a problem, e.g. the connection being closed or an expected I/O error
+    ExpectedSafekeeperError(postgres::Error),
+    /// An "error" reply that carries the SUCCESSFUL_COMPLETION status code and an "ending streaming" message.
+    /// Holds only the message text of the original postgres error, dropping its misleading "error" severity
+    SuccessfulCompletion(String),
+    /// Any other failure, wrapped as a generic `anyhow` error
+    Other(anyhow::Error),
+}
+
+impl From<tokio_postgres::Error> for WalReceiverError {
+    /// Sorts a postgres client error into one of the three walreceiver outcomes.
+    ///
+    /// The checks run in this order:
+    /// 1. a database error with the SUCCESSFUL_COMPLETION code whose message contains
+    ///    "ending streaming" becomes `SuccessfulCompletion`;
+    /// 2. a closed connection, or an error whose source is an expected I/O error,
+    ///    becomes `ExpectedSafekeeperError`;
+    /// 3. everything else becomes `Other`.
+    fn from(err: tokio_postgres::Error) -> Self {
+        // Keep only the message text: the wrapping DbError carries a misleading
+        // "error" severity even though streaming ended normally.
+        let completion_message = err
+            .as_db_error()
+            .filter(|db| {
+                db.code() == &SqlState::SUCCESSFUL_COMPLETION
+                    && db.message().contains("ending streaming")
+            })
+            .map(|db| db.message().to_string());
+        if let Some(message) = completion_message {
+            return Self::SuccessfulCompletion(message);
+        }
+
+        // The source chain is inspected only when the connection is not already
+        // reported as closed.
+        let is_expected = err.is_closed()
+            || err
+                .source()
+                .and_then(|cause| cause.downcast_ref::<std::io::Error>())
+                .map_or(false, is_expected_io_error);
+
+        if is_expected {
+            Self::ExpectedSafekeeperError(err)
+        } else {
+            Self::Other(anyhow::Error::new(err))
+        }
+    }
+}
+
+impl From<anyhow::Error> for WalReceiverError {
+    /// Wraps an arbitrary `anyhow` error, unchanged, in the generic `Other` variant.
+    fn from(source: anyhow::Error) -> Self {
+        WalReceiverError::Other(source)
+    }
+}
+
+impl From<WalDecodeError> for WalReceiverError {
+    /// Converts a WAL decoding failure into the generic `Other` variant by
+    /// boxing it into an `anyhow::Error`.
+    fn from(decode_err: WalDecodeError) -> Self {
+        let wrapped = anyhow::Error::new(decode_err);
+        WalReceiverError::Other(wrapped)
+    }
+}
+
 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
 pub(super) async fn handle_walreceiver_connection(
@@ -70,7 +114,7 @@ pub(super) async fn handle_walreceiver_connection(
    connect_timeout: Duration,
    ctx: RequestContext,
    node: NodeId,
-) -> anyhow::Result<()> {
+) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    WALRECEIVER_STARTED_CONNECTIONS.inc();
@@ -130,11 +174,15 @@ pub(super) async fn handle_walreceiver_connection(
                connection_result = connection => match connection_result {
                    Ok(()) => debug!("Walreceiver db connection closed"),
                    Err(connection_error) => {
-                        if connection_error.is_expected() {
-                            // silence, because most likely we've already exited the outer call
-                            // with a similar error.
-                        } else {
-                            warn!("Connection aborted: {connection_error:#}")
+                        match WalReceiverError::from(connection_error) {
+                            WalReceiverError::ExpectedSafekeeperError(_) => {
+                                // silence, because most likely we've already exited the outer call
+                                // with a similar error.
+                            },
+                            WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Other(err) => {
+                                warn!("Connection aborted: {err:#}")
+                            }
                        }
                    }
                },
@@ -180,7 +228,7 @@ pub(super) async fn handle_walreceiver_connection(
    let mut startpoint = last_rec_lsn;

    if startpoint == Lsn(0) {
-        bail!("No previous WAL position");
+        return Err(WalReceiverError::Other(anyhow!("No previous WAL position")));
    }

    // There might be some padding after the last full record, skip it.
@@ -262,7 +310,9 @@ pub(super) async fn handle_walreceiver_connection(
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
-                        ensure!(lsn.is_aligned());
+                        if !lsn.is_aligned() {
+                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
+                        }

                        walingest
                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
@@ -419,51 +469,3 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
        Err(IdentifyError.into())
    }
 }
-
-/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors.
-pub(super) trait ExpectedError {
-    /// Test if this error is an ok error.
-    ///
-    /// We don't want to report connectivity problems as real errors towards connection manager because
-    /// 1. they happen frequently enough to make server logs hard to read and
-    /// 2. the connection manager can retry other safekeeper.
-    ///
-    /// If this function returns `true`, it's such an error.
-    /// The caller should log it at info level and then report to connection manager that we're done handling this connection.
-    /// Connection manager will then handle reconnections.
-    ///
-    /// If this function returns an `false` the error should be propagated and the connection manager
-    /// will log the error at ERROR level.
-    fn is_expected(&self) -> bool;
-}
-
-impl ExpectedError for postgres::Error {
-    fn is_expected(&self) -> bool {
-        self.is_closed()
-            || self
-                .source()
-                .and_then(|source| source.downcast_ref::<std::io::Error>())
-                .map(is_expected_io_error)
-                .unwrap_or(false)
-            || self
-                .as_db_error()
-                .filter(|db_error| {
-                    db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
-                        && db_error.message().contains("ending streaming")
-                })
-                .is_some()
-    }
-}
-
-impl ExpectedError for anyhow::Error {
-    fn is_expected(&self) -> bool {
-        let head = self.downcast_ref::<postgres::Error>();
-
-        let tail = self
-            .chain()
-            .filter_map(|e| e.downcast_ref::<postgres::Error>());
-
-        // check if self or any of the chained/sourced errors are expected
-        head.into_iter().chain(tail).any(|e| e.is_expected())
-    }
-}
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,6 +1,7 @@
 use crate::metrics::RemoteOpFileKind;

 use super::storage_layer::LayerFileName;
+use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -140,36 +141,24 @@ impl UploadQueue {
            }
        }

-        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
-        for layer_name in &index_part.timeline_layers {
-            match index_part
-                .layer_metadata
-                .get(layer_name)
-                .map(LayerFileMetadata::from)
-            {
-                Some(layer_metadata) => {
-                    files.insert(layer_name.to_owned(), layer_metadata);
-                }
-                None => {
-                    anyhow::bail!(
-                        "No remote layer metadata found for layer {}",
-                        layer_name.file_name()
-                    );
-                }
-            }
+        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
+        for (layer_name, layer_metadata) in &index_part.layer_metadata {
+            files.insert(
+                layer_name.to_owned(),
+                LayerFileMetadata::from(layer_metadata),
+            );
        }

-        let index_part_metadata = index_part.parse_metadata()?;
        info!(
            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part_metadata.disk_consistent_lsn()
+            index_part.metadata.disk_consistent_lsn()
        );

        let state = UploadQueueInitialized {
            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: index_part_metadata.clone(),
-            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
+            latest_metadata: index_part.metadata.clone(),
+            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
@@ -222,7 +211,7 @@ pub(crate) struct Delete {
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
-    UploadLayer(LayerFileName, LayerFileMetadata),
+    UploadLayer(ResidentLayer, LayerFileMetadata),

    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),
@@ -237,13 +226,8 @@ pub(crate) enum UploadOp {
 impl std::fmt::Display for UploadOp {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
-            UploadOp::UploadLayer(path, metadata) => {
-                write!(
-                    f,
-                    "UploadLayer({}, size={:?})",
-                    path.file_name(),
-                    metadata.file_size()
-                )
+            UploadOp::UploadLayer(layer, metadata) => {
+                write!(f, "UploadLayer({}, size={:?})", layer, metadata.file_size())
            }
            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
            UploadOp::Delete(delete) => write!(
--- a/Show More
+++ b/Show More