test: rename previous test, cleanup, still does not work

fix: provide better context for the other test
test: actually duplicate L1 layer in test
2026-06-02 04:50:38 +00:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 14:19:54 +03:00 · 2023-08-30 10:31:56 +03:00
73 changed files with 6284 additions and 3501 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -737,34 +737,6 @@ jobs:
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --cleanup

-      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
-      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
-      # so we won't build extension twice, but extract them from compute-node.
-      #
-      # For now we use extensions image only for new custom extensitons
-      - name: Kaniko build extensions only
-        run: |
-          # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
-          # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
-          # it still fails with error:
-          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
-          #
-          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
-          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
-
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-                           --context . \
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
-                           --build-arg PG_VERSION=${{ matrix.version }} \
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
-                           --dockerfile Dockerfile.compute-node \
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --cleanup \
-                           --target postgres-extensions
-
      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
@@ -780,7 +752,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.16.3
+      VM_BUILDER_VERSION: v0.17.5

    steps:
      - name: Checkout
@@ -803,7 +775,7 @@ jobs:
        run: |
          ./vm-builder \
            -enable-file-cache \
-            -enable-monitor \
+            -cgroup-uid=postgres \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -886,10 +858,8 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -900,10 +870,8 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -925,65 +893,56 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  upload-postgres-extensions-to-s3:
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-       github.event_name != 'workflow_dispatch'
-    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
-    needs: [ tag, promote-images ]
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15 ]
-
-    env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
-
+  build-private-extensions:
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
+    needs: [ tag ]
    steps:
-      - name: Pull postgres-extensions image
+      - name: Set PR's status to pending and request a remote CI test
        run: |
-          docker pull ${EXTENSIONS_IMAGE}
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"

-      - name: Create postgres-extensions container
-        id: create-container
-        run: |
-          EID=$(docker create ${EXTENSIONS_IMAGE} true)
-          echo "EID=${EID}" >> $GITHUB_OUTPUT
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"build-and-upload-extensions\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"

-      - name: Extract postgres-extensions from container
-        run: |
-          rm -rf ./extensions-to-upload # Just in case
-          mkdir -p extensions-to-upload
-
-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
-          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
-
-      - name: Upload postgres-extensions to S3
-        run: |
-          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
-            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
-          done
-
-      - name: Cleanup
-        if: ${{ always() && steps.create-container.outputs.EID }}
-        run: |
-          docker rm ${{ steps.create-container.outputs.EID }} || true
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"build-and-upload-extensions\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"remote_branch_name\": \"${{ github.ref_name }}\"
+              }
+            }"

  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
--- a/13
+++ b/13
@@ -1,11 +1,12 @@
-/compute_tools/ @neondatabase/control-plane
+/compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/ @neondatabase/compute @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute 
-/libs/remote_storage/ @neondatabase/storage 
-/libs/safekeeper_api/ @neondatabase/safekeepers  
-/pageserver/ @neondatabase/compute @neondatabase/storage 
+/libs/postgres_ffi/ @neondatabase/compute
+/libs/remote_storage/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/safekeepers
+/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
+/pageserver/ @neondatabase/compute @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/proxy/ @neondatabase/control-plane 
+/proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -190,7 +190,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -201,7 +201,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -553,12 +553,13 @@ dependencies = [

 [[package]]
 name = "axum"
-version = "0.6.18"
+version = "0.6.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39"
+checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
 dependencies = [
 "async-trait",
 "axum-core",
+ "base64 0.21.1",
 "bitflags",
 "bytes",
 "futures-util",
@@ -573,7 +574,13 @@ dependencies = [
 "pin-project-lite",
 "rustversion",
 "serde",
+ "serde_json",
+ "serde_path_to_error",
+ "serde_urlencoded",
+ "sha1",
 "sync_wrapper",
+ "tokio",
+ "tokio-tungstenite 0.20.0",
 "tower",
 "tower-layer",
 "tower-service",
@@ -673,7 +680,7 @@ dependencies = [
 "regex",
 "rustc-hash",
 "shlex",
- "syn 2.0.16",
+ "syn 2.0.28",
 "which",
 ]

@@ -765,6 +772,19 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]
+name = "cgroups-rs"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fb3af90c8d48ad5f432d8afb521b5b40c2a2fce46dd60e05912de51c47fba64"
+dependencies = [
+ "libc",
+ "log",
+ "nix 0.25.1",
+ "regex",
+ "thiserror",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.24"
@@ -849,7 +869,7 @@ dependencies = [
 "heck",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -907,6 +927,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-compression",
+ "cfg-if",
 "chrono",
 "clap",
 "compute_api",
@@ -925,6 +946,7 @@ dependencies = [
 "tar",
 "tokio",
 "tokio-postgres",
+ "tokio-util",
 "toml_edit",
 "tracing",
 "tracing-opentelemetry",
@@ -932,6 +954,7 @@ dependencies = [
 "tracing-utils",
 "url",
 "utils",
+ "vm_monitor",
 "workspace_hack",
 "zstd",
 ]
@@ -978,7 +1001,7 @@ dependencies = [
 "comfy-table",
 "compute_api",
 "git-version",
- "nix",
+ "nix 0.26.2",
 "once_cell",
 "pageserver_api",
 "postgres",
@@ -1184,7 +1207,7 @@ dependencies = [
 "proc-macro2",
 "quote",
 "strsim",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1195,7 +1218,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
 "darling_core",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1260,7 +1283,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1316,7 +1339,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1512,7 +1535,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -1863,8 +1886,8 @@ dependencies = [
 "hyper",
 "pin-project",
 "tokio",
- "tokio-tungstenite",
- "tungstenite",
+ "tokio-tungstenite 0.18.0",
+ "tungstenite 0.18.0",
 ]

 [[package]]
@@ -1928,6 +1951,19 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "inotify"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
+dependencies = [
+ "bitflags",
+ "futures-core",
+ "inotify-sys",
+ "libc",
+ "tokio",
+]
+
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -2251,6 +2287,18 @@ dependencies = [
 "tempfile",
 ]

+[[package]]
+name = "nix"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4"
+dependencies = [
+ "autocfg",
+ "bitflags",
+ "cfg-if",
+ "libc",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.2"
@@ -2285,7 +2333,7 @@ dependencies = [
 "crossbeam-channel",
 "filetime",
 "fsevent-sys",
- "inotify",
+ "inotify 0.9.6",
 "kqueue",
 "libc",
 "mio",
@@ -2293,6 +2341,15 @@ dependencies = [
 "windows-sys 0.45.0",
 ]

+[[package]]
+name = "ntapi"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
+dependencies = [
+ "winapi",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2386,7 +2443,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -2573,7 +2630,7 @@ dependencies = [
 "hyper",
 "itertools",
 "metrics",
- "nix",
+ "nix 0.26.2",
 "num-traits",
 "num_cpus",
 "once_cell",
@@ -2596,6 +2653,7 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
+ "smallvec",
 "storage_broker",
 "strum",
 "strum_macros",
@@ -2773,7 +2831,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -2970,7 +3028,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
 "proc-macro2",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -2981,9 +3039,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.64"
+version = "1.0.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
 dependencies = [
 "unicode-ident",
 ]
@@ -3145,9 +3203,9 @@ dependencies = [

 [[package]]
 name = "quote"
-version = "1.0.27"
+version = "1.0.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
+checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
 dependencies = [
 "proc-macro2",
 ]
@@ -3569,9 +3627,9 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.100.1"
+version = "0.100.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
+checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
 dependencies = [
 "ring",
 "untrusted",
@@ -3798,22 +3856,22 @@ dependencies = [

 [[package]]
 name = "serde"
-version = "1.0.163"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
+checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
 dependencies = [
 "serde_derive",
 ]

 [[package]]
 name = "serde_derive"
-version = "1.0.163"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
+checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -3827,6 +3885,16 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "serde_path_to_error"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4beec8bce849d58d06238cb50db2e1c417cfeafa4c63f692b15c82b7c80f8335"
+dependencies = [
+ "itoa",
+ "serde",
+]
+
 [[package]]
 name = "serde_spanned"
 version = "0.6.2"
@@ -3873,7 +3941,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -3972,9 +4040,9 @@ dependencies = [

 [[package]]
 name = "smallvec"
-version = "1.10.0"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
+checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"

 [[package]]
 name = "socket2"
@@ -4111,9 +4179,9 @@ dependencies = [

 [[package]]
 name = "syn"
-version = "2.0.16"
+version = "2.0.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
+checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -4139,14 +4207,29 @@ dependencies = [
 ]

 [[package]]
-name = "tar"
-version = "0.4.38"
+name = "sysinfo"
+version = "0.29.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
+checksum = "165d6d8539689e3d3bc8b98ac59541e1f21c7de7c85d60dc80e43ae0ed2113db"
+dependencies = [
+ "cfg-if",
+ "core-foundation-sys",
+ "libc",
+ "ntapi",
+ "once_cell",
+ "rayon",
+ "winapi",
+]
+
+[[package]]
+name = "tar"
+version = "0.4.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
 dependencies = [
 "filetime",
 "libc",
- "xattr 0.2.3",
+ "xattr",
 ]

 [[package]]
@@ -4228,7 +4311,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -4343,7 +4426,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -4437,7 +4520,7 @@ dependencies = [
 "redox_syscall 0.3.5",
 "tokio",
 "tokio-stream",
- "xattr 1.0.0",
+ "xattr",
 ]

 [[package]]
@@ -4449,7 +4532,19 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite",
+ "tungstenite 0.18.0",
+]
+
+[[package]]
+name = "tokio-tungstenite"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2dbec703c26b00d74844519606ef15d09a7d6857860f84ad223dec002ddea2"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite 0.20.0",
 ]

 [[package]]
@@ -4641,7 +4736,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 ]

 [[package]]
@@ -4770,6 +4865,25 @@ dependencies = [
 "utf-8",
 ]

+[[package]]
+name = "tungstenite"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "data-encoding",
+ "http",
+ "httparse",
+ "log",
+ "rand",
+ "sha1",
+ "thiserror",
+ "url",
+ "utf-8",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -4897,7 +5011,7 @@ dependencies = [
 "hyper",
 "jsonwebtoken",
 "metrics",
- "nix",
+ "nix 0.26.2",
 "once_cell",
 "pin-project-lite",
 "pq_proto",
@@ -4915,6 +5029,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
+ "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -4951,6 +5066,28 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

+[[package]]
+name = "vm_monitor"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "axum",
+ "cgroups-rs",
+ "clap",
+ "futures",
+ "inotify 0.10.2",
+ "serde",
+ "serde_json",
+ "sysinfo",
+ "tokio",
+ "tokio-postgres",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-subscriber",
+ "workspace_hack",
+]
+
 [[package]]
 name = "vsimd"
 version = "0.8.0"
@@ -5021,7 +5158,7 @@ dependencies = [
 "once_cell",
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 "wasm-bindgen-shared",
 ]

@@ -5055,7 +5192,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.16",
+ "syn 2.0.28",
 "wasm-bindgen-backend",
 "wasm-bindgen-shared",
 ]
@@ -5340,12 +5477,14 @@ name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "axum",
 "bytes",
 "cc",
 "chrono",
 "clap",
 "clap_builder",
 "crossbeam-utils",
+ "digest",
 "either",
 "fail",
 "futures",
@@ -5354,6 +5493,7 @@ dependencies = [
 "futures-executor",
 "futures-sink",
 "futures-util",
+ "hyper",
 "itertools",
 "libc",
 "log",
@@ -5372,9 +5512,10 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
+ "smallvec",
 "socket2 0.4.9",
 "syn 1.0.109",
- "syn 2.0.16",
+ "syn 2.0.28",
 "tokio",
 "tokio-rustls 0.23.4",
 "tokio-util",
@@ -5383,7 +5524,6 @@ dependencies = [
 "tower",
 "tracing",
 "tracing-core",
- "tracing-subscriber",
 "url",
 ]

@@ -5404,15 +5544,6 @@ dependencies = [
 "time",
 ]

-[[package]]
-name = "xattr"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "xattr"
 version = "1.0.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
    "libs/remote_storage",
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
+    "libs/vm_monitor",
 ]

 [workspace.package]
@@ -41,12 +42,14 @@ aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
 aws-credential-types = "0.55"
 aws-types = "0.55"
+axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
+cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
 close_fds = "0.3.2"
@@ -74,6 +77,7 @@ humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.9"
+inotify = "0.10.2"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -105,12 +109,14 @@ rustls = "0.20"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
+sysinfo = "0.29.2"
 sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
+smallvec = "1.11"
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -133,7 +139,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
@@ -169,6 +175,7 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
+vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -764,29 +764,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
-#
-# Extenstion only
-#
-#########################################################################################
-FROM python:3.9-slim-bullseye AS generate-ext-index
-ARG PG_VERSION
-ARG BUILD_TAG
-RUN apt update && apt install -y zstd
-
-# copy the control files here
-COPY --from=kq-imcx-pg-build /extensions/ /extensions/
-COPY --from=pg-anon-pg-build /extensions/ /extensions/
-COPY --from=postgis-build /extensions/ /extensions/
-COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
-
-FROM scratch AS postgres-extensions
-# After the transition this layer will include all extensitons.
-# As for now, it's only a couple for testing purposses
-COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
-COPY --from=generate-ext-index /ext_index.json /ext_index.json
-
 #########################################################################################
 #
 # Final layer
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 chrono.workspace = true
+cfg-if.workspace = true
 clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
@@ -23,6 +24,7 @@ tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -34,4 +36,5 @@ utils.workspace = true
 workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -35,7 +35,6 @@
 //!
 use std::collections::HashMap;
 use std::fs::File;
-use std::panic;
 use std::path::Path;
 use std::process::exit;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
@@ -271,6 +270,55 @@ fn main() -> Result<()> {
        }
    };

+    // Start the vm-monitor if directed to (AUTOSCALING env var together with
+    // --vm-monitor-addr). It only runs on linux because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            use std::env;
+            use tokio_util::sync::CancellationToken;
+            use tracing::warn;
+            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
+            let cgroup = matches.get_one::<String>("cgroup");
+
+            // Only make a runtime if we need to.
+            // Note: it seems like you can make a runtime in an inner scope and
+            // if you start a task in it it won't be dropped. However, make it
+            // in the outermost scope just to be safe.
+            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
+                (None, None) => None,
+                (None, Some(_)) => {
+                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
+                    None
+                }
+                (Some(_), None) => {
+                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
+                }
+                (Some(_), Some(_)) => Some(
+                    tokio::runtime::Builder::new_multi_thread()
+                        .worker_threads(4)
+                        .enable_all()
+                        .build()
+                        .expect("failed to create tokio runtime for monitor"),
+                ),
+            };
+
+            // This token is used internally by the monitor to clean up all threads
+            let token = CancellationToken::new();
+
+            let vm_monitor = &rt.as_ref().map(|rt| {
+                rt.spawn(vm_monitor::start(
+                    Box::leak(Box::new(vm_monitor::Args {
+                        cgroup: cgroup.cloned(),
+                        pgconnstr: file_cache_connstr.cloned(),
+                        addr: vm_monitor_addr.cloned().unwrap(),
+                    })),
+                    token.clone(),
+                ))
+            });
+        }
+    }
+
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    if let Some(mut pg) = pg {
@@ -284,6 +332,24 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }

+    // Terminate the vm_monitor so it releases the file watcher on
+    // /sys/fs/cgroup/neon-postgres.
+    // Note: the vm-monitor only runs on linux because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            // `vm_monitor` is only `Some` when a runtime was created and the
+            // monitor task was spawned on it.
+            if let Some(handle) = vm_monitor {
+                // Signal cancellation through the token that was cloned into
+                // the monitor when it was started.
+                token.cancel();
+                // Abort the spawned task that runs the monitor itself.
+                handle.abort();
+
+                // If handle is some, rt must have been used to produce it, and
+                // hence is also some. Give the runtime at most 2 seconds to
+                // shut down.
+                rt.unwrap().shutdown_timeout(Duration::from_secs(2));
+            }
+        }
+    }
+
    // Maybe sync safekeepers again, to speed up next startup
    let compute_state = compute.state.lock().unwrap().clone();
    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -393,6 +459,29 @@ fn cli() -> clap::Command {
                .long("remote-ext-config")
                .value_name("REMOTE_EXT_CONFIG"),
        )
+        // TODO(fprasx): we currently have default arguments because the cloud PR
+        // to pass them in hasn't been merged yet. We should get rid of them once
+        // the PR is merged.
+        .arg(
+            Arg::new("vm-monitor-addr")
+                .long("vm-monitor-addr")
+                .default_value("0.0.0.0:10301")
+                .value_name("VM_MONITOR_ADDR"),
+        )
+        .arg(
+            Arg::new("cgroup")
+                .long("cgroup")
+                .default_value("neon-postgres")
+                .value_name("CGROUP"),
+        )
+        .arg(
+            Arg::new("filecache-connstr")
+                .long("filecache-connstr")
+                .default_value(
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
+                )
+                .value_name("FILECACHE_CONNSTR"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::env;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
@@ -175,6 +176,27 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

+/// Builds the [`Command`] used to launch `cmd`.
+///
+/// When the `AUTOSCALING` environment variable is set (i.e. we are a VM), the
+/// returned command is `cgexec -g memory:neon-postgres <cmd>`, so that `cmd`
+/// runs inside the `neon-postgres` cgroup. Otherwise it is plain
+/// `Command::new(cmd)`.
+///
+/// Use this function to start postgres: running it in the cgroup is what lets
+/// autoscaling control postgres' resource usage. The cgroup will exist in VMs
+/// because vm-builder creates it during the sysinit phase of its inittab.
+fn maybe_cgexec(cmd: &str) -> Command {
+    // The cplane sets this env var for autoscaling computes. `var_os` is used
+    // so that the value does not have to be valid unicode; that should never
+    // be a concern, but just in case.
+    let autoscaling = env::var_os("AUTOSCALING").is_some();
+    if !autoscaling {
+        return Command::new(cmd);
+    }
+
+    let mut cgexec = Command::new("cgexec");
+    cgexec.arg("-g").arg("memory:neon-postgres").arg(cmd);
+    cgexec
+}
+
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -451,7 +473,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let sync_handle = Command::new(&self.pgbin)
+        let sync_handle = maybe_cgexec(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -586,7 +608,7 @@ impl ComputeNode {

        // Start postgres
        info!("starting postgres");
-        let mut pg = Command::new(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.pgbin)
            .args(["-D", pgdata])
            .spawn()
            .expect("cannot start postgres process");
@@ -614,7 +636,7 @@ impl ComputeNode {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
-        let mut pg = Command::new(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.pgbin)
            .args(["-D", &self.pgdata])
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
--- a/deny.toml
+++ b/deny.toml
@@ -4,7 +4,12 @@
 # to your expectations and requirements.

 # Root options
-targets = []
+targets = [
+    { triple = "x86_64-unknown-linux-gnu" },
+    { triple = "aarch64-unknown-linux-gnu" },
+    { triple = "aarch64-apple-darwin" },
+    { triple = "x86_64-apple-darwin" },
+]
 all-features = false
 no-default-features = false
 feature-depth = 1
@@ -18,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = []
+ignore = ["RUSTSEC-2023-0052"]

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,6 +26,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -1,18 +1,31 @@
 use std::fmt::{Debug, Display};

 use futures::Future;
+use tokio_util::sync::CancellationToken;

 pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
 pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;

-pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+pub async fn exponential_backoff(
+    n: u32,
+    base_increment: f64,
+    max_seconds: f64,
+    cancel: &CancellationToken,
+) {
    let backoff_duration_seconds =
        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
    if backoff_duration_seconds > 0.0 {
        tracing::info!(
            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+
+        drop(
+            tokio::time::timeout(
+                std::time::Duration::from_secs_f64(backoff_duration_seconds),
+                cancel.cancelled(),
+            )
+            .await,
+        )
    }
 }

@@ -24,28 +37,57 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
    }
 }

+/// Configure cancellation for a retried operation: when to cancel (the token), and
+/// what kind of error to return on cancellation
+///
+/// `E` is the error type of the retried operation and `CF` is the closure that
+/// builds the `E` returned when the operation is cancelled.
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    // Checked by `retry` before every attempt and also handed to
+    // `exponential_backoff`, so that the wait between attempts can end early.
+    token: CancellationToken,
+    // Called by `retry` to produce the error it returns once `token` is cancelled.
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    /// Pair a cancellation `token` with the `on_cancel` closure that produces
+    /// the error to return when that token is cancelled.
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
 /// retries passed operation until one of the following conditions are met:
 /// Encountered error is considered as permanent (non-retryable)
 /// Retries have been exhausted.
 /// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
 /// When attempts cross `warn_threshold` function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-pub async fn retry<T, O, F, E>(
+/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
+/// to drop out promptly on shutdown.
+pub async fn retry<T, O, F, E, CF>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
+    cancel: Cancel<E, CF>,
 ) -> Result<T, E>
 where
    // Not std::error::Error because anyhow::Error doesnt implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
-    E: Display + Debug,
+    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
 {
    let mut attempts = 0;
    loop {
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
+        }
+
        let result = op().await;
        match result {
            Ok(_) => {
@@ -80,6 +122,7 @@ where
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel.token,
        )
        .await;
        attempts += 1;
@@ -132,6 +175,7 @@ mod tests {
            1,
            1,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await;

@@ -157,6 +201,7 @@ mod tests {
            2,
            2,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap();
@@ -179,6 +224,7 @@ mod tests {
            2,
            2,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap_err();
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -68,6 +68,8 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+pub mod sync;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -0,0 +1 @@
+pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -0,0 +1,306 @@
+use std::sync::{Arc, Mutex, MutexGuard};
+use tokio::sync::Semaphore;
+
+/// A once-cell in the spirit of [`tokio::sync::OnceCell`], built on
+/// [`OwnedSemaphorePermit`] rather than `SemaphorePermit`. This allows use of
+/// `take`, which does not require holding an outer mutex guard for the duration
+/// of initialization.
+///
+/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
+///
+/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
+pub struct OnceCell<T> {
+    inner: Mutex<Inner<T>>,
+}
+
+impl<T> Default for OnceCell<T> {
+    /// Create new uninitialized [`OnceCell`].
+    fn default() -> Self {
+        let inner = Mutex::new(Inner::default());
+        Self { inner }
+    }
+}
+
+/// The state of the cell is encoded in the semaphore:
+/// - open semaphore means the value is `None`, not yet initialized
+/// - closed semaphore means the value has been initialized
+#[derive(Debug)]
+struct Inner<T> {
+    init_semaphore: Arc<Semaphore>,
+    value: Option<T>,
+}
+
+impl<T> Default for Inner<T> {
+    /// Uninitialized state: no value, and an open semaphore with one permit.
+    fn default() -> Self {
+        let init_semaphore = Arc::new(Semaphore::new(1));
+        Self {
+            init_semaphore,
+            value: None,
+        }
+    }
+}
+
+impl<T> OnceCell<T> {
+    /// Creates an already initialized `OnceCell` with the given value.
+    ///
+    /// The internal semaphore is closed right away, which is how an
+    /// initialized cell is represented.
+    pub fn new(value: T) -> Self {
+        let closed = Semaphore::new(1);
+        closed.close();
+
+        let inner = Inner {
+            init_semaphore: Arc::new(closed),
+            value: Some(value),
+        };
+        Self {
+            inner: Mutex::new(inner),
+        }
+    }
+
+    /// Returns a guard to the value, initializing it with `factory` first if
+    /// the cell is empty. Only the caller holding the initialization permit
+    /// gets to run `factory`.
+    ///
+    /// If `factory` fails, its error is returned and the cell stays
+    /// uninitialized, so a later call may try again.
+    ///
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    where
+        F: FnOnce() -> Fut,
+        Fut: std::future::Future<Output = Result<T, E>>,
+    {
+        // Fast path: already initialized. Otherwise grab the semaphore and
+        // release the mutex before awaiting anything.
+        let sem = {
+            let inner = self.inner.lock().unwrap();
+            if inner.value.is_some() {
+                return Ok(Guard(inner));
+            }
+            Arc::clone(&inner.init_semaphore)
+        };
+
+        match sem.acquire_owned().await {
+            Err(_closed) => {
+                // The semaphore was closed while we were waiting.
+                let inner = self.inner.lock().unwrap();
+                assert!(
+                    inner.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                Ok(Guard(inner))
+            }
+            // The permit must stay bound for the whole arm: dropping it is
+            // what lets the next waiter attempt initialization.
+            Ok(_permit) => {
+                // now we try
+                let value = factory().await?;
+
+                let mut inner = self.inner.lock().unwrap();
+                assert!(
+                    inner.value.is_none(),
+                    "we won permit, must not be initialized"
+                );
+                inner.value = Some(value);
+                inner.init_semaphore.close();
+                Ok(Guard(inner))
+            }
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let inner = self.inner.lock().unwrap();
+        let initialized = inner.value.is_some();
+        initialized.then(|| Guard(inner))
+    }
+}
+
+/// Plain guard object giving short-lived access to inspect or clone the held,
+/// initialized value. It wraps the mutex guard of the cell it came from.
+#[derive(Debug)]
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for Guard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        let Guard(inner) = self;
+        inner
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> std::ops::DerefMut for Guard<'_, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        let Guard(inner) = self;
+        inner
+            .value
+            .as_mut()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<'a, T> Guard<'a, T> {
+    /// Take the current value, and a new permit for its deinitialization.
+    ///
+    /// The cell's internal state is replaced by a fresh, uninitialized one. The
+    /// returned permit belongs to the semaphore of that new state, and any
+    /// following [`OnceCell::get_or_init`] will wait on it to complete.
+    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
+        let fresh = Inner::default();
+        // Take the only permit of the new semaphore before it becomes visible
+        // through the cell.
+        let permit = Arc::clone(&fresh.init_semaphore)
+            .try_acquire_owned()
+            .expect("we just created this");
+
+        let previous = std::mem::replace(&mut *self.0, fresh);
+        let value = previous
+            .value
+            .expect("guard is not created unless value has been initialized");
+        (value, permit)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        convert::Infallible,
+        sync::atomic::{AtomicUsize, Ordering},
+        time::Duration,
+    };
+
+    /// Spawns many tasks that all call `get_or_init` on the same empty cell and
+    /// checks that the factory ran, and its future was polled, exactly once.
+    #[tokio::test]
+    async fn many_initializers() {
+        #[derive(Default, Debug)]
+        struct Counters {
+            factory_got_to_run: AtomicUsize,
+            future_polled: AtomicUsize,
+            winners: AtomicUsize,
+        }
+
+        let initializers = 100;
+
+        let cell = Arc::new(OnceCell::default());
+        let counters = Arc::new(Counters::default());
+        // One extra party for this test function itself, see `barrier.wait()` below.
+        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
+
+        let mut js = tokio::task::JoinSet::new();
+        for i in 0..initializers {
+            js.spawn({
+                let cell = cell.clone();
+                let counters = counters.clone();
+                let barrier = barrier.clone();
+
+                async move {
+                    barrier.wait().await;
+                    // A task "won" if the value in the cell is its own index.
+                    let won = {
+                        let g = cell
+                            .get_or_init(|| {
+                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
+                                async {
+                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
+                                    Ok::<_, Infallible>(i)
+                                }
+                            })
+                            .await
+                            .unwrap();
+
+                        *g == i
+                    };
+
+                    if won {
+                        counters.winners.fetch_add(1, Ordering::Relaxed);
+                    }
+                }
+            });
+        }
+
+        barrier.wait().await;
+
+        while let Some(next) = js.join_next().await {
+            next.expect("no panics expected");
+        }
+
+        // All tasks have been joined, so this is the only remaining reference.
+        let mut counters = Arc::try_unwrap(counters).unwrap();
+
+        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
+        assert_eq!(*counters.future_polled.get_mut(), 1);
+        assert_eq!(*counters.winners.get_mut(), 1);
+    }
+
+    /// Checks that `get_or_init` on a deinitialized cell does not complete
+    /// until the permit returned by `take_and_deinit` has been dropped.
+    #[tokio::test(start_paused = true)]
+    async fn reinit_waits_for_deinit() {
+        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
+        let sleep_for = Duration::from_secs(1);
+        let initial = 42;
+        let reinit = 1;
+        let cell = Arc::new(OnceCell::new(initial));
+
+        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
+
+        let jh = tokio::spawn({
+            let cell = cell.clone();
+            let deinitialization_started = deinitialization_started.clone();
+            async move {
+                // `_permit` is held until the end of this task, i.e. across the sleep.
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                assert_eq!(answer, initial);
+
+                deinitialization_started.wait().await;
+                tokio::time::sleep(sleep_for).await;
+            }
+        });
+
+        // Do not start reinitializing before the spawned task has taken the value.
+        deinitialization_started.wait().await;
+
+        let started_at = tokio::time::Instant::now();
+        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
+            .await
+            .unwrap();
+
+        let elapsed = started_at.elapsed();
+        assert!(
+            elapsed >= sleep_for,
+            "initialization should had taken at least the time time slept with permit"
+        );
+
+        jh.await.unwrap();
+
+        assert_eq!(*cell.get().unwrap(), reinit);
+    }
+
+    /// A failing factory leaves the cell uninitialized, so initialization can
+    /// be attempted again until a factory succeeds.
+    #[tokio::test]
+    async fn initialization_attemptable_until_ok() {
+        let cell = OnceCell::default();
+
+        // Every one of these attempts runs its factory and gets its error back.
+        for _ in 0..10 {
+            cell.get_or_init(|| async { Err("whatever error") })
+                .await
+                .unwrap_err();
+        }
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "finally success");
+    }
+
+    /// Dropping a `get_or_init` future in the middle of its factory leaves the
+    /// cell uninitialized and usable for a later initialization.
+    #[tokio::test]
+    async fn initialization_is_cancellation_safe() {
+        let cell = OnceCell::default();
+
+        let barrier = tokio::sync::Barrier::new(2);
+
+        // This factory reaches the barrier and then never completes.
+        let initializer = cell.get_or_init(|| async {
+            barrier.wait().await;
+            futures::future::pending::<()>().await;
+
+            Ok::<_, Infallible>("never reached")
+        });
+
+        // The barrier branch can only complete once the factory has reached
+        // its own `barrier.wait()`, so the initializer is dropped mid-factory.
+        tokio::select! {
+            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
+            _ = barrier.wait() => {}
+        };
+
+        // now initializer is dropped
+
+        assert!(cell.get().is_none());
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "now initialized");
+    }
+}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "vm_monitor"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "vm-monitor"
+path = "./src/bin/monitor.rs"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+axum.workspace = true
+clap.workspace = true
+futures.workspace = true
+inotify.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+sysinfo.workspace = true
+tokio.workspace = true
+tokio-postgres.workspace = true
+tokio-stream.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "linux")'.dependencies]
+cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -0,0 +1,18 @@
+# `vm-monitor`
+
+The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
+along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
+two primary roles: 1) notifying agents when immediate upscaling is necessary due
+to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
+out upscaling and downscaling decisions.
+
+## More on scaling
+
+We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
+To control thresholds for receiving memory usage notifications, we start Postgres
+in the `neon-postgres` cgroup and set its `memory.{max,high}`.
+
+* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
+* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
+where initial development of the monitor happened. The repository is no longer
+maintained but the commit history may be useful for debugging.
--- a/libs/vm_monitor/src/bin/monitor.rs
+++ b/libs/vm_monitor/src/bin/monitor.rs
@@ -0,0 +1,33 @@
+// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
+// we can test the monitor as part of the entire autoscaling system in
+// neondatabase/autoscaling.
+//
+// The monitor was previously started by vm-builder, and for testing purposes,
+// we can mimic that setup with this binary.
+
+/// Entry point of the standalone monitor binary (linux only): sets up JSON
+/// logging, parses the command line into `Args` and runs `vm_monitor::start`.
+#[cfg(target_os = "linux")]
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    use clap::Parser;
+    use tokio_util::sync::CancellationToken;
+    use tracing_subscriber::EnvFilter;
+    use vm_monitor::Args;
+
+    // JSON log output with file, line number and span list attached; the
+    // filter is taken from the environment.
+    let subscriber = tracing_subscriber::fmt::Subscriber::builder()
+        .json()
+        .with_file(true)
+        .with_line_number(true)
+        .with_span_list(true)
+        .with_env_filter(EnvFilter::from_default_env())
+        .finish();
+    tracing::subscriber::set_global_default(subscriber)?;
+
+    // Leak the parsed arguments to get the `&'static Args` passed to `start`.
+    let args: &'static Args = Box::leak(Box::new(Args::parse()));
+    // Nothing in this binary cancels the token; it is only handed to the monitor.
+    let token = CancellationToken::new();
+    vm_monitor::start(args, token).await
+}
+
+/// On non-linux targets the binary still builds, but panics as soon as it is run.
+#[cfg(not(target_os = "linux"))]
+fn main() {
+    panic!("the monitor requires cgroups, which are only available on linux")
+}
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -0,0 +1,693 @@
+use std::{
+    fmt::{Debug, Display},
+    fs,
+    pin::pin,
+    sync::atomic::{AtomicU64, Ordering},
+};
+
+use anyhow::{anyhow, bail, Context};
+use cgroups_rs::{
+    freezer::FreezerController,
+    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    memory::MemController,
+    MaxValue,
+    Subsystem::{Freezer, Mem},
+};
+use inotify::{EventStream, Inotify, WatchMask};
+use tokio::sync::mpsc::{self, error::TryRecvError};
+use tokio::time::{Duration, Instant};
+use tokio_stream::{Stream, StreamExt};
+use tracing::{info, warn};
+
+use crate::protocol::Resources;
+use crate::MiB;
+
+/// Monotonically increasing counter of the number of memory.high events
+/// the cgroup has experienced.
+///
+/// We use this to determine if a modification to the `memory.events` file actually
+/// changed the `high` field. If not, we don't care about the change. When we
+/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
+/// to see if it changed since last time.
+pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Monotonically increasing counter that gives each cgroup event a unique id.
+///
+/// This allows us to answer questions like "did this upscale arrive before this
+/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
+/// with a sequence number. As such, prefer to use the `Sequenced` type rather
+/// than this static directly.
+static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
+
+/// The kinds of memory event that can be looked up in a `memory.events` file.
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub enum MemoryEvent {
+    Low,
+    High,
+    Max,
+    Oom,
+    OomKill,
+    OomGroupKill,
+}
+
+impl MemoryEvent {
+    /// The lowercase, snake_case name of this event; this is what the
+    /// `Display` impl writes.
+    fn as_str(&self) -> &str {
+        match *self {
+            Self::Low => "low",
+            Self::High => "high",
+            Self::Max => "max",
+            Self::Oom => "oom",
+            Self::OomKill => "oom_kill",
+            Self::OomGroupKill => "oom_group_kill",
+        }
+    }
+}
+
+impl Display for MemoryEvent {
+    /// Writes the same text as [`MemoryEvent::as_str`].
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let name = self.as_str();
+        f.write_str(name)
+    }
+}
+
+/// Configuration for a `CgroupWatcher`
+///
+/// The values used by default are set in the `Default` impl.
+#[derive(Debug, Clone)]
+pub struct Config {
+    /// The target difference between the total memory reserved for the cgroup
+    /// and the value of the cgroup's memory.high.
+    ///
+    /// In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
+    /// use (equal to system memory, minus whatever's taken out for the file cache).
+    oom_buffer_bytes: u64,
+
+    /// The amount of memory, in bytes, below a proposed new value for
+    /// memory.high that the cgroup's memory usage must be for us to downscale
+    ///
+    /// In other words, we can downscale only when:
+    ///
+    /// ```text
+    /// memory.current + memory_high_buffer_bytes < (proposed) memory.high
+    /// ```
+    ///
+    /// TODO: there's some minor issues with this approach -- in particular, that we might have
+    /// memory in use by the kernel's page cache that we're actually ok with getting rid of.
+    pub(crate) memory_high_buffer_bytes: u64,
+
+    /// The maximum duration that we're allowed to pause the cgroup for while
+    /// waiting for the autoscaler-agent to upscale us
+    max_upscale_wait: Duration,
+
+    /// The required minimum time that we must wait before re-freezing
+    /// the cgroup while waiting for the autoscaler-agent to upscale us.
+    do_not_freeze_more_often_than: Duration,
+
+    /// The amount of memory, in bytes, that we should periodically increase memory.high
+    /// by while waiting for the autoscaler-agent to upscale us.
+    ///
+    /// This exists to avoid the excessive throttling that happens when a cgroup is above its
+    /// memory.high for too long. See more here:
+    /// <https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217>
+    memory_high_increase_by_bytes: u64,
+
+    /// The period at which we should repeatedly increase the value
+    /// of the cgroup's memory.high while we're waiting on upscaling and memory.high
+    /// is still being hit.
+    ///
+    /// Technically speaking, this actually serves as a rate limit to moderate responding to
+    /// memory.high events, but these are roughly equivalent if the process is still allocating
+    /// memory.
+    memory_high_increase_every: Duration,
+}
+
+impl Config {
+    /// Derive the cgroup's memory.high from the total system memory: the total
+    /// minus `oom_buffer_bytes`, saturating at zero instead of underflowing.
+    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
+        let reserved = self.oom_buffer_bytes;
+        total_system_mem.saturating_sub(reserved)
+    }
+}
+
+impl Default for Config {
+    /// Default tunables: 100 MiB for both buffers, plus the freeze and
+    /// memory.high increase pacing described below.
+    fn default() -> Self {
+        // while waiting for upscale, don't freeze for more than 20ms every 1s
+        let max_upscale_wait = Duration::from_millis(20);
+        let do_not_freeze_more_often_than = Duration::from_millis(1000);
+
+        // while waiting for upscale, increase memory.high by 10MiB every 25ms
+        let memory_high_increase_by_bytes = 10 * MiB;
+        let memory_high_increase_every = Duration::from_millis(25);
+
+        Self {
+            oom_buffer_bytes: 100 * MiB,
+            memory_high_buffer_bytes: 100 * MiB,
+            max_upscale_wait,
+            do_not_freeze_more_often_than,
+            memory_high_increase_by_bytes,
+            memory_high_increase_every,
+        }
+    }
+}
+
+/// A value tagged with the point in time at which it was created, such as an
+/// upscale request or memory.high event.
+///
+/// Creating a `Sequenced` takes the next number from a static atomic counter.
+/// Sequence numbers are monotonically increasing, allowing us to answer
+/// questions like "did this upscale happen after this memory.high event?" by
+/// comparing the sequence numbers of the two events.
+#[derive(Debug, Clone)]
+pub struct Sequenced<T> {
+    seqnum: u64,
+    data: T,
+}
+
+impl<T> Sequenced<T> {
+    /// Wrap `data`, tagging it with the next sequence number.
+    pub fn new(data: T) -> Self {
+        let seqnum = EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel);
+        Self { seqnum, data }
+    }
+}
+
+/// Responds to `MonitorEvents` to manage the cgroup: preventing it from being
+/// OOM killed or throttled.
+///
+/// The `CgroupWatcher` primarily achieves this by reading from a stream of
+/// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the
+/// cgroup happy.
+#[derive(Debug)]
+pub struct CgroupWatcher {
+    /// The tunables this watcher operates with; see [`Config`].
+    pub config: Config,
+
+    /// The sequence number of the last upscale.
+    ///
+    /// If we receive a memory.high event that has a _lower_ sequence number than
+    /// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
+    /// can safely ignore it.
+    ///
+    /// Note: this doesn't _need_ interior mutability but we use it anyways so
+    /// that methods take `&self`, not `&mut self`.
+    last_upscale_seqnum: AtomicU64,
+
+    /// A channel on which we send messages to request upscale from the dispatcher.
+    upscale_requester: mpsc::Sender<()>,
+
+    /// The actual cgroup we are watching and managing.
+    cgroup: cgroups_rs::Cgroup,
+}
+
+/// Look up the counter for `event` in a cgroup `memory.events` file.
+///
+/// `path` is the location of the `memory.events` file to read. Fails if the
+/// file cannot be read, if it has no line for `event`, or if the count on that
+/// line is not a valid `u64`.
+///
+/// For more info, see the `memory.events` section of the [kernel docs]
+/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
+fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
+    let contents = fs::read_to_string(path)
+        .with_context(|| format!("failed to read memory.events from {path}"))?;
+
+    // Each line of the file is a `<name> <count>` pair, e.g.:
+    // low 42
+    // high 101
+    // ...
+    let wanted = event.as_str();
+    for line in contents.lines() {
+        // Lines without a space separator are skipped.
+        if let Some((name, count)) = line.split_once(' ') {
+            if name == wanted {
+                // Only the first matching line is considered.
+                return count
+                    .parse::<u64>()
+                    .with_context(|| format!("failed to parse memory.{event} as u64"));
+            }
+        }
+    }
+
+    Err(anyhow!("failed to find entry for memory.{event} events in {path}"))
+}
+
+/// Create an event stream that produces events whenever the file at the provided
+/// path is modified.
+///
+/// Fails if inotify cannot be initialized, if the watch on `path` cannot be
+/// added, or if the event stream cannot be created.
+fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
+    info!("creating file watcher for {path}");
+    let inotify = Inotify::init().context("failed to initialize file watcher")?;
+    // Only modifications of the file are watched.
+    inotify
+        .watches()
+        .add(path, WatchMask::MODIFY)
+        .with_context(|| format!("failed to start watching {path}"))?;
+    inotify
+        // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
+        // to store one event at a time - if the event gets written over, that's
+        // ok. We still see that there is an event. For more information, see:
+        // https://man7.org/linux/man-pages/man7/inotify.7.html
+        .into_event_stream([0u8; 1024])
+        .context("failed to start inotify event stream")
+}
+
+impl CgroupWatcher {
+    /// Create a `CgroupWatcher` for cgroup `name`, plus a stream of its `memory.high` events.
+    #[tracing::instrument(skip_all, fields(%name))]
+    pub fn new(
+        name: String,
+        // A channel on which to send upscale requests
+        upscale_requester: mpsc::Sender<()>,
+    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
+        // TODO: clarify exactly why we need v2
+        // Make sure cgroups v2 (aka unified) are supported
+        if !is_cgroup2_unified_mode() {
+            anyhow::bail!("cgroups v2 not supported");
+        }
+        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
+
+        // Start monitoring the cgroup for memory events. In general, for
+        // cgroups v2 (aka unified), metrics are reported in files like
+        // > `/sys/fs/cgroup/{name}/{metric}`
+        // We are looking for `memory.high` events, which are stored in the
+        // file `memory.events`. For more info, see the `memory.events` section
+        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
+        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
+        let memory_events = create_file_watcher(&path)
+            .with_context(|| format!("failed to create event watcher for {path}"))?
+            // This would be nicer with .inspect_err followed by .ok
+            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
+                Ok(high) => Some(high),
+                Err(error) => {
+                    // TODO: Might want to just panic here
+                    warn!(?error, "failed to read high events count from {}", &path);
+                    None
+                }
+            })
+            // Only report the event if the memory.high count increased
+            .filter_map(|high| {
+                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
+                    Some(high)
+                } else {
+                    None
+                }
+            })
+            .map(Sequenced::new);
+
+        let initial_count = get_event_count(
+            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
+            MemoryEvent::High,
+        )?;
+
+        info!(initial_count, "initial memory.high event count");
+
+        // Seed `MEMORY_EVENT_COUNT` with the current count, since processes that
+        // ran in the cgroup earlier could already have made it non-zero.
+        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
+
+        Ok((
+            Self {
+                cgroup,
+                upscale_requester,
+                last_upscale_seqnum: AtomicU64::new(0),
+                config: Default::default(),
+            },
+            memory_events,
+        ))
+    }
+
+    /// The entrypoint for the `CgroupWatcher`: loops over upscales and `memory.high` events.
+    #[tracing::instrument(skip_all)]
+    pub async fn watch<E>(
+        &self,
+        // These are ~dependency injected~ (fancy, I know) because this function
+        // should never return.
+        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
+        // -> therefore: if we want to stick it in an Arc so many threads can access
+        //    it, methods can never take mutable access.
+        //     - note: we use the Arc strategy so that a) we can call this function
+        //             right here and b) the runner can call the set/get_memory methods
+        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
+        //    we just pass them in here instead of holding them in fields, as that
+        //    would require this method to take &mut self.
+        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
+        events: E,
+    ) -> anyhow::Result<()>
+    where
+        E: Stream<Item = Sequenced<u64>>,
+    {
+        // There are several actions we might take when receiving a `memory.high`,
+        // such as freezing the cgroup, or increasing its `memory.high`. We don't
+        // want to do these things too often (because postgres needs to run, and
+        // we only have so much memory). These timers serve as rate limits for this.
+        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut events = pin!(events);
+
+        // Are we waiting to be upscaled? Could be true if we request upscale due
+        // to a memory.high event and it does not arrive in time.
+        let mut waiting_on_upscale = false;
+
+        loop {
+            tokio::select! {
+                upscale = upscales.recv() => {
+                    let Sequenced { seqnum, data } = upscale
+                        .context("failed to listen on upscale notification channel")?;
+                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                }
+                event = events.next() => {
+                    let Some(Sequenced { seqnum, .. }) = event else {
+                        bail!("failed to listen for memory.high events")
+                    };
+                    // The memory.high came before our last upscale, so we consider
+                    // it resolved
+                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
+                        info!(
+                            "received memory.high event, but it came before our last upscale -> ignoring it"
+                        );
+                        continue;
+                    }
+
+                    // The memory.high came after our latest upscale. We don't
+                    // want to do anything yet, so peek the next event in hopes
+                    // that it's an upscale.
+                    if let Some(upscale_num) = self
+                        .upscaled(&mut upscales)
+                        .context("failed to check if we were upscaled")?
+                    {
+                        if upscale_num > seqnum {
+                            info!(
+                                "received memory.high event, but it came before our last upscale -> ignoring it"
+                            );
+                            continue;
+                        }
+                    }
+
+                    // If it's been long enough since we last froze, freeze the
+                    // cgroup and request upscale
+                    if wait_to_freeze.is_elapsed() {
+                        info!("received memory.high event -> requesting upscale");
+                        waiting_on_upscale = self
+                            .handle_memory_high_event(&mut upscales)
+                            .await
+                            .context("failed to handle upscale")?;
+                        wait_to_freeze
+                            .as_mut()
+                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
+                        continue;
+                    }
+
+                    // Ok, we can't freeze, just request upscale
+                    if !waiting_on_upscale {
+                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");
+
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to request upscaling because we got upscaled");
+                            continue;
+                        }
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+                        continue;
+                    }
+
+                    // Shoot, we can't freeze and we're still waiting on upscale,
+                    // increase memory.high to reduce throttling
+                    if wait_to_increase_memory_high.is_elapsed() {
+                        info!(
+                            "received memory.high event, \
+                            but too soon to refreeze and already requested upscale \
+                            -> increasing memory.high"
+                        );
+
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to increase memory.high because got upscaled");
+                            continue;
+                        }
+
+                        // Request upscale anyways (the agent will handle deduplicating
+                        // requests)
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+
+                        let memory_high =
+                            self.get_high_bytes().context("failed to get memory.high")?;
+                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
+                        info!(
+                            current_high_bytes = memory_high,
+                            new_high_bytes = new_high,
+                            "updating memory.high"
+                        );
+                        self.set_high_bytes(new_high)
+                            .context("failed to set memory.high")?;
+                        wait_to_increase_memory_high
+                            .as_mut()
+                            .reset(Instant::now() + self.config.memory_high_increase_every)
+                    }
+
+                    // Otherwise there is nothing we can do for this event
+                }
+            };
+        }
+    }
+
+    /// Handle a `memory.high` event, returning whether we are still waiting on
+    /// upscale by the time the function returns.
+    ///
+    /// The general plan for handling a `memory.high` event is as follows:
+    /// 1. Freeze the cgroup
+    /// 2. Start a timer for `self.config.max_upscale_wait`
+    /// 3. Request upscale (if that fails, thaw the cgroup and return the error)
+    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
+    /// 5. Return whether or not we are still waiting for upscale. If we are,
+    ///    we'll increase the cgroups memory.high to avoid getting oom killed
+    #[tracing::instrument(skip_all)]
+    async fn handle_memory_high_event(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<bool> {
+        // Immediately freeze the cgroup before doing anything else.
+        info!("received memory.high event -> freezing cgroup");
+        self.freeze().context("failed to freeze cgroup")?;
+
+        // We'll use this for logging durations
+        let start_time = Instant::now();
+
+        // Await the upscale until we have to unfreeze
+        let timed =
+            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
+
+        // Request the upscale
+        info!(wait = ?self.config.max_upscale_wait, "sending request for immediate upscaling");
+        if let Err(e) = self.upscale_requester.send(()).await {
+            // **important**: unfreeze the cgroup before reporting the error
+            info!("error requesting upscale -> thawing cgroup");
+            self.thaw()
+                .context("failed to thaw cgroup after failing to request upscale")?;
+            return Err(e).context("failed to request upscale");
+        }
+
+        let waiting_on_upscale = match timed.await {
+            Ok(Ok(())) => {
+                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
+                false
+            }
+            // **important**: unfreeze the cgroup before ?-reporting the error
+            Ok(Err(e)) => {
+                info!("error waiting for upscale -> thawing cgroup");
+                self.thaw()
+                    .context("failed to thaw cgroup after errored waiting for upscale")?;
+                Err(e.context("failed to await upscale"))?
+            }
+            Err(_) => {
+                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
+                true
+            }
+        };
+
+        info!("thawing cgroup");
+        self.thaw().context("failed to thaw cgroup")?;
+
+        Ok(waiting_on_upscale)
+    }
+
+    /// Non-blocking check for an upscale notification: returns its sequence number
+    /// if one was pending, `None` if the channel is empty, or an error if the
+    /// channel was disconnected. A received upscale is also recorded and logged.
+    #[tracing::instrument(skip_all)]
+    fn upscaled(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<Option<u64>> {
+        match upscales.try_recv() {
+            Ok(Sequenced { seqnum, data }) => {
+                self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                Ok(Some(seqnum))
+            }
+            Err(TryRecvError::Empty) => Ok(None),
+            Err(TryRecvError::Disconnected) => {
+                bail!("upscale notification channel was disconnected")
+            }
+        }
+    }
+
+    /// Wait for the next upscale notification and record its sequence number.
+    ///
+    /// Only the `upscales` channel is polled here, so `memory.high` events that
+    /// fire in the meantime are not handled by this function.
+    ///
+    /// This is used in `handle_memory_high_event`, where we need to listen
+    /// for upscales in particular so we know if we can thaw the cgroup early.
+    #[tracing::instrument(skip_all)]
+    async fn await_upscale(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<()> {
+        // A closed channel (`recv` returning `None`) is reported as an error
+        let upscale = upscales.recv().await.context("error listening for upscales")?;
+        // Remember the sequence number of the upscale we just saw
+        self.last_upscale_seqnum.store(upscale.seqnum, Ordering::Release);
+        Ok(())
+    }
+
+    /// Get the cgroup's path, as reported by the underlying `cgroups_rs::Cgroup`.
+    pub fn path(&self) -> &str {
+        self.cgroup.path()
+    }
+}
+
+/// A pair of memory limits (`high` and `max`) that we apply to a cgroup.
+///
+/// Setting these values also affects the thresholds for receiving usage alerts.
+#[derive(Debug)]
+pub struct MemoryLimits {
+    high: u64,
+    max: u64,
+}
+
+impl MemoryLimits {
+    pub fn new(high: u64, max: u64) -> Self {
+        Self { high, max }
+    }
+}
+
+// Methods for manipulating the actual cgroup
+impl CgroupWatcher {
+    /// Get a handle on the freezer subsystem.
+    ///
+    /// Fails if the cgroup's subsystem list has no freezer controller.
+    fn freezer(&self) -> anyhow::Result<&FreezerController> {
+        self.cgroup
+            .subsystems()
+            .iter()
+            .find_map(|sub| match sub {
+                Freezer(controller) => Some(controller),
+                _ => None,
+            })
+            .ok_or_else(|| anyhow!("could not find freezer subsystem"))
+    }
+
+    /// Attempt to freeze the cgroup.
+    ///
+    /// Fails if the freezer subsystem is unavailable or the freeze itself fails.
+    pub fn freeze(&self) -> anyhow::Result<()> {
+        let freezer = self.freezer().context("failed to get freezer subsystem")?;
+        freezer.freeze().context("failed to freeze")
+    }
+
+    /// Attempt to thaw the cgroup.
+    ///
+    /// Fails if the freezer subsystem is unavailable or the thaw itself fails.
+    pub fn thaw(&self) -> anyhow::Result<()> {
+        let freezer = self.freezer().context("failed to get freezer subsystem")?;
+        freezer.thaw().context("failed to thaw")
+    }
+
+    /// Get a handle on the memory subsystem.
+    ///
+    /// This only looks the controller up in the cgroup's list of subsystems and
+    /// fails if none is found.
+    ///
+    /// NOTE(review): an older version of this comment referred to
+    /// `self.memory_update_lock`; `new` initializes no such field, so it was dropped.
+    fn memory(&self) -> anyhow::Result<&MemController> {
+        self.cgroup
+            .subsystems()
+            .iter()
+            .find_map(|sub| match sub {
+                Mem(controller) => Some(controller),
+                _ => None,
+            })
+            .ok_or_else(|| anyhow!("could not find memory subsystem"))
+    }
+
+    /// Get cgroup current memory usage.
+    ///
+    /// The value is the `usage_in_bytes` field of the memory controller's
+    /// `memory_stat()`.
+    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
+        let memory = self.memory().context("failed to get memory subsystem")?;
+        Ok(memory.memory_stat().usage_in_bytes)
+    }
+
+    /// Set cgroup memory.high threshold, clamping `bytes` to at most `i64::MAX`.
+    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        let clamped = bytes.min(i64::MAX as u64) as i64;
+        let update = cgroups_rs::memory::SetMemory {
+            low: None,
+            high: Some(MaxValue::Value(clamped)),
+            min: None,
+            max: None,
+        };
+        let memory = self.memory().context("failed to get memory subsystem")?;
+        memory.set_mem(update).context("failed to set memory.high")
+    }
+
+    /// Set cgroup memory.high and memory.max, clamping each to at most `i64::MAX`.
+    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
+        info!(
+            limits.high,
+            limits.max,
+            path = self.path(),
+            "writing new memory limits",
+        );
+        let clamp = |bytes: u64| MaxValue::Value(bytes.min(i64::MAX as u64) as i64);
+        let update = cgroups_rs::memory::SetMemory {
+            min: None,
+            low: None,
+            high: Some(clamp(limits.high)),
+            max: Some(clamp(limits.max)),
+        };
+        self.memory()
+            .context("failed to get memory subsystem while setting memory limits")?
+            .set_mem(update)
+            .context("failed to set memory limits")
+    }
+
+    /// Given some amount of available memory, set the desired cgroup memory limits
+    ///
+    /// `memory.max` becomes `available_memory` itself, while `memory.high` is
+    /// whatever `self.config.calculate_memory_high_value` derives from it.
+    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
+        let limits = MemoryLimits::new(
+            self.config.calculate_memory_high_value(available_memory),
+            available_memory,
+        );
+        info!(path = self.path(), memory = ?limits, "setting cgroup memory");
+        self.set_limits(&limits)
+            .context("failed to set cgroup memory limits")
+    }
+
+    /// Get memory.high threshold; `MaxValue::Max` is reported as `i64::MAX`.
+    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
+        let memory = self
+            .memory()
+            .context("failed to get memory subsystem while getting memory statistics")?;
+        let stats = memory
+            .get_mem()
+            .context("failed to get memory statistics from subsystem")?;
+        match stats.high {
+            Some(MaxValue::Value(bytes)) => Ok(bytes as u64),
+            Some(MaxValue::Max) => Ok(i64::MAX as u64),
+            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
+        }
+    }
+}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -0,0 +1,155 @@
+//! Managing the websocket connection and other signals in the monitor.
+//!
+//! Contains types that manage the interaction (not data interchange, see `protocol`)
+//! between informant and monitor, allowing us to process and send messages in a
+//! straightforward way. The dispatcher also manages the signals that come from
+//! the cgroup (requesting upscale), and the signals that go to the cgroup
+//! (notifying it of upscale).
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::{
+    stream::{SplitSink, SplitStream},
+    SinkExt, StreamExt,
+};
+use tokio::sync::mpsc;
+use tracing::info;
+
+use crate::cgroup::Sequenced;
+use crate::protocol::{
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
+    PROTOCOL_MIN_VERSION,
+};
+
+/// The central handler for all communications in the monitor.
+///
+/// The dispatcher has two purposes:
+/// 1. Manage the connection to the informant, sending and receiving messages.
+/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
+///    and sending a message to the informant when the cgroup manager requests
+///    upscale.
+#[derive(Debug)]
+pub struct Dispatcher {
+    /// We read informant messages off of `source`
+    pub(crate) source: SplitStream<WebSocket>,
+
+    /// We send messages to the informant through `sink`
+    sink: SplitSink<WebSocket, Message>,
+
+    /// Used to notify the cgroup when we are upscaled.
+    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+
+    /// When the cgroup requests upscale it will send on this channel. In response
+    /// we send an `UpscaleRequest` to the agent.
+    pub(crate) request_upscale_events: mpsc::Receiver<()>,
+
+    /// The protocol version we have agreed to use with the informant. This is negotiated
+    /// during the creation of the dispatcher, and should be the highest shared protocol
+    /// version.
+    ///
+    // NOTE: currently unused, but will almost certainly be used in the future
+    // as the protocol changes
+    #[allow(unused)]
+    pub(crate) proto_version: ProtocolVersion,
+}
+
+impl Dispatcher {
+    /// Creates a new dispatcher using the passed-in connection.
+    ///
+    /// Performs a negotiation with the informant to determine the highest protocol
+    /// version that both support. This consists of two steps:
+    /// 1. Wait for the informant to send the range of protocols it supports.
+    /// 2. Send a protocol version that works for us as well, or an error if there
+    ///    is no compatible version.
+    pub async fn new(
+        stream: WebSocket,
+        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+        request_upscale_events: mpsc::Receiver<()>,
+    ) -> anyhow::Result<Self> {
+        let (mut sink, mut source) = stream.split();
+
+        // Figure out the highest protocol version we both support
+        info!("waiting for informant to send protocol version range");
+        let Some(message) = source.next().await else {
+            bail!("websocket connection closed while performing protocol handshake")
+        };
+
+        let message = message.context("failed to read protocol version range off connection")?;
+
+        let Message::Text(message_text) = message else {
+            // All messages should be in text form, since we don't do any
+            // pinging/ponging. See nhooyr/websocket's implementation and the
+            // informant/agent for more info
+            bail!("received non-text message during protocol handshake: {message:?}")
+        };
+
+        let monitor_range = ProtocolRange {
+            min: PROTOCOL_MIN_VERSION,
+            max: PROTOCOL_MAX_VERSION,
+        };
+
+        let informant_range: ProtocolRange = serde_json::from_str(&message_text)
+            .context("failed to deserialize protocol version range")?;
+
+        info!(range = ?informant_range, "received protocol version range");
+
+        let highest_shared_version = match monitor_range.highest_shared_version(&informant_range) {
+            Ok(version) => {
+                sink.send(Message::Text(
+                    serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
+                ))
+                .await
+                .context("failed to notify informant of negotiated protocol version")?;
+                version
+            }
+            Err(e) => {
+                sink.send(Message::Text(
+                    serde_json::to_string(&ProtocolResponse::Error(format!(
+                        "Received protocol version range {} which does not overlap with {}",
+                        informant_range, monitor_range
+                    )))
+                    .unwrap(),
+                ))
+                .await
+                .context(
+                    "failed to notify informant of no overlap between protocol version ranges",
+                )?;
+                Err(e).context("error determining suitable protocol version range")?
+            }
+        };
+
+        Ok(Self {
+            sink,
+            source,
+            notify_upscale_events,
+            request_upscale_events,
+            proto_version: highest_shared_version,
+        })
+    }
+
+    /// Notify the cgroup manager that we have received upscale. This only waits
+    /// for the message to be queued on the channel, not for any acknowledgement.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
+        self.notify_upscale_events
+            .send(resources)
+            .await
+            .context("failed to send resources across channel")
+    }
+
+    /// Send a message to the informant.
+    ///
+    /// The message is logged, serialized to JSON and written to the sink as a
+    /// text frame.
+    ///
+    /// Although this function is small, it has one major benefit: it only
+    /// accepts an `OutboundMsg`, so it's hard to accidentally serialize the
+    /// wrong thing and send it, which `self.sink.send` would happily allow.
+    pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
+        info!(?message, "sending message");
+        let payload = serde_json::to_string(&message)
+            .map(Message::Text)
+            .context("failed to serialize message")?;
+        self.sink.send(payload).await.context("stream error sending message")
+    }
+}
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -0,0 +1,306 @@
+//! Logic for configuring and scaling the Postgres file cache.
+
+use std::num::NonZeroU64;
+
+use crate::MiB;
+use anyhow::{anyhow, Context};
+use tokio_postgres::{types::ToSql, Client, NoTls, Row};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+/// Manages Postgres' file cache by keeping a connection open.
+#[derive(Debug)]
+pub struct FileCacheState {
+    client: Client,
+    conn_str: String,
+    pub(crate) config: FileCacheConfig,
+
+    /// A token for cancelling the spawned connection task during shutdown.
+    token: CancellationToken,
+}
+
+#[derive(Debug)]
+pub struct FileCacheConfig {
+    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
+    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
+    /// memory available for the cgroup.
+    pub(crate) in_memory: bool,
+
+    /// The size of the file cache, in terms of the size of the resource it consumes
+    /// (currently: only memory)
+    ///
+    /// For example, setting `resource_multiplier = 0.75` gives the cache a target size of 75% of total
+    /// resources.
+    ///
+    /// This value must be strictly between 0 and 1.
+    resource_multiplier: f64,
+
+    /// The required minimum amount of memory, in bytes, that must remain available
+    /// after subtracting the file cache.
+    ///
+    /// This value must be non-zero.
+    min_remaining_after_cache: NonZeroU64,
+
+    /// Controls the rate of increase in the file cache's size as it grows from zero
+    /// (when total resources equals min_remaining_after_cache) to the desired size based on
+    /// `resource_multiplier`.
+    ///
+    /// A `spread_factor` of zero means that all additional resources will go to the cache until it
+    /// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
+    /// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
+    /// its desired size".
+    ///
+    /// This value must be >= 0, and must retain an increase that is more than what would be given by
+    /// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
+    /// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
+    /// as desired by `resource_multiplier`.
+    ///
+    /// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
+    spread_factor: f64,
+}
+
+impl Default for FileCacheConfig {
+    fn default() -> Self {
+        FileCacheConfig {
+            // hand ~10/11 of every additional byte to the cache, the rest to other memory
+            spread_factor: 0.1,
+            // (512 + 128) MiB = 640 MiB
+            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
+            // i.e. 75 %
+            resource_multiplier: 0.75,
+            in_memory: true,
+        }
+    }
+}
+
+impl FileCacheConfig {
+    /// Make sure fields of the config are valid, both individually and together.
+    pub fn validate(&self) -> anyhow::Result<()> {
+        // Single field validity
+        anyhow::ensure!(
+            0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
+            "resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
+            self.resource_multiplier
+        );
+        anyhow::ensure!(
+            self.spread_factor >= 0.0,
+            "spread_factor must be >= 0, got {}",
+            self.spread_factor
+        );
+
+        // Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
+        //
+        // As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
+        // `spread_factor`, respectively. They are:
+        //
+        //                 `total`           `min_remaining_after_cache`
+        //   size = ————————————————————— - —————————————————————————————
+        //           `spread_factor` + 1         `spread_factor` + 1
+        //
+        // and
+        //
+        //   size = `resource_multiplier` × total
+        //
+        // .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
+        // form, with y = "size" and x = "total".
+        //
+        // These lines intersect at:
+        //
+        //               `min_remaining_after_cache`
+        //   ———————————————————————————————————————————————————
+        //    1 - `resource_multiplier` × (`spread_factor` + 1)
+        //
+        // We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
+        // guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
+        // (We also need it to be >= 0, but that's already guaranteed.)
+
+        let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
+        anyhow::ensure!(
+            intersect_factor < 1.0,
+            "incompatible resource_multiplier and spread_factor"
+        );
+        Ok(())
+    }
+
+    /// Calculate the desired size of the cache in bytes, given the total memory in bytes
+    pub fn calculate_cache_size(&self, total: u64) -> u64 {
+        // *Note*: all units are in bytes, including the MiB-rounded return value.
+        let available = total.saturating_sub(self.min_remaining_after_cache.get());
+        if available == 0 {
+            return 0;
+        }
+
+        // Conversions to ensure we don't overflow from floating-point ops
+        let size_from_spread =
+            i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
+
+        let size_from_normal = (total as f64 * self.resource_multiplier) as u64;
+
+        let byte_size = u64::min(size_from_spread, size_from_normal);
+
+        // The file cache operates in units of mebibytes, so the sizes we produce should
+        // be rounded to a mebibyte. We round down to be conservative.
+        byte_size / MiB * MiB
+    }
+}
+
+impl FileCacheState {
+    /// Validate `config`, then open a connection to the Postgres file cache.
+    #[tracing::instrument(skip_all, fields(%conn_str, ?config))]
+    pub async fn new(
+        conn_str: &str,
+        config: FileCacheConfig,
+        token: CancellationToken,
+    ) -> anyhow::Result<Self> {
+        config.validate().context("file cache config is invalid")?;
+
+        info!(conn_str, "connecting to Postgres file cache");
+        let connected = Self::connect(conn_str, token.clone()).await;
+        let client = connected.context("failed to connect to postgres file cache")?;
+
+        // The state owns its own copy of the connection string; `query_with_retry`
+        // uses it to reconnect.
+        Ok(FileCacheState {
+            client,
+            conn_str: conn_str.to_string(),
+            config,
+            token,
+        })
+    }
+
+    /// Connect to Postgres.
+    ///
+    /// Aborts the spawned connection task if `token` is cancelled. This is not
+    /// a method as it is called in [`FileCacheState::new`].
+    #[tracing::instrument(skip_all, fields(%conn_str))]
+    async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
+        let (client, conn) = tokio_postgres::connect(conn_str, NoTls)
+            .await
+            .context("failed to connect to pg client")?;
+
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own. See tokio-postgres docs.
+        crate::spawn_with_cancel(
+            token,
+            |res| {
+                if let Err(error) = res {
+                    error!(%error, "postgres error")
+                }
+            },
+            conn,
+        );
+
+        Ok(client)
+    }
+
+    /// Execute a query with a retry if necessary.
+    ///
+    /// If the initial query fails, we restart the database connection and attempt
+    /// it again, exactly once.
+    #[tracing::instrument(skip_all, fields(%statement))]
+    pub async fn query_with_retry(
+        &mut self,
+        statement: &str,
+        params: &[&(dyn ToSql + Sync)],
+    ) -> anyhow::Result<Vec<Row>> {
+        match self
+            .client
+            .query(statement, params)
+            .await
+            .context("failed to execute query")
+        {
+            Ok(rows) => Ok(rows),
+            Err(e) => {
+                error!(error = ?e, "postgres error: {e} -> retrying");
+
+                let client = FileCacheState::connect(&self.conn_str, self.token.clone())
+                    .await
+                    .context("failed to connect to postgres file cache")?;
+                info!("successfully reconnected to postgres client");
+
+                // Replace the old client and attempt the query with the new one
+                self.client = client;
+                self.client
+                    .query(statement, params)
+                    .await
+                    .context("failed to execute query a second time")
+            }
+        }
+    }
+
+    /// Get the current size limit of the file cache, in bytes.
+    #[tracing::instrument(skip_all)]
+    pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
+        self.query_with_retry(
+            // The file cache GUC variable is in MiB, but the conversion with
+            // pg_size_bytes means that the end result we get is in bytes.
+            "SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));",
+            &[],
+        )
+        .await
+        .context("failed to query pg for file cache size")?
+        .first()
+        .ok_or_else(|| anyhow!("file cache size query returned no rows"))?
+        // pg_size_bytes returns a bigint which is the same as an i64.
+        .try_get::<_, i64>(0)
+        // Assumes the configured size limit is never negative, so the cast is lossless.
+        .map(|bytes| bytes as u64)
+        .context("failed to extract file cache size from query result")
+    }
+
+    /// Attempt to set the file cache size, returning the size in bytes it was
+    /// actually set to (capped at the maximum and rounded down to whole MiB).
+    #[tracing::instrument(skip_all, fields(%num_bytes))]
+    pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
+        let max_bytes = self
+            // The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
+            // means that the end result we get is in bytes.
+            .query_with_retry(
+                "SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
+                &[],
+            )
+            .await
+            .context("failed to query pg for max file cache size")?
+            .first()
+            .ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
+            .try_get::<_, i64>(0)
+            .map(|bytes| bytes as u64)
+            .context("failed to extract max file cache size from query result")?;
+
+        let max_mb = max_bytes / MiB;
+        let num_mb = u64::min(num_bytes, max_bytes) / MiB;
+
+        // `capped` carries its own leading space so the log message has no stray whitespace.
+        let capped = if num_bytes > max_bytes {
+            " (capped by maximum size)"
+        } else {
+            ""
+        };
+        info!(
+            size = num_mb,
+            max = max_mb,
+            "updating file cache size{capped}",
+        );
+
+        // note: even though the normal ways to get the cache size produce values with trailing "MB"
+        // (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
+        // it expects to set the value is "integer number of MB" without trailing units.
+        // For some reason, this *really* wasn't working with normal arguments, so that's
+        // why we're constructing the query here.
+        self.client
+            .query(
+                &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
+                &[],
+            )
+            .await
+            .context("failed to change file cache size limit")?;
+
+        // must use pg_reload_conf to have the settings change take effect
+        self.client
+            .execute("SELECT pg_reload_conf();", &[])
+            .await
+            .context("failed to reload config")?;
+
+        Ok(num_mb * MiB)
+    }
+}
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -0,0 +1,205 @@
+#![cfg(target_os = "linux")]
+
+use anyhow::Context;
+use axum::{
+    extract::{ws::WebSocket, State, WebSocketUpgrade},
+    response::Response,
+};
+use axum::{routing::get, Router, Server};
+use clap::Parser;
+use futures::Future;
+use std::{fmt::Debug, time::Duration};
+use sysinfo::{RefreshKind, System, SystemExt};
+use tokio::{sync::broadcast, task::JoinHandle};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+use runner::Runner;
+
+// Code that interfaces with agent
+pub mod dispatcher;
+pub mod protocol;
+
+pub mod cgroup;
+pub mod filecache;
+pub mod runner;
+
+/// The vm-monitor is an autoscaling component started by compute_ctl.
+///
+/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
+/// memory pressure by making requests to the autoscaler-agent.
+#[derive(Debug, Parser)]
+pub struct Args {
+    /// The name of the cgroup we should monitor for memory.high events. This
+    /// is the cgroup that postgres should be running in.
+    #[arg(short, long)]
+    pub cgroup: Option<String>,
+
+    /// The connection string for the Postgres file cache we should manage.
+    #[arg(short, long)]
+    pub pgconnstr: Option<String>,
+
+    /// The address we should listen on for connection requests. For the
+    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
+    #[arg(short, long)]
+    pub addr: String,
+}
+
+impl Args {
+    pub fn addr(&self) -> &str {
+        self.addr.as_str()
+    }
+}
+
+/// The number of bytes in one mebibyte.
+#[allow(non_upper_case_globals)]
+const MiB: u64 = 1 << 20;
+
+/// Express a byte count in mebibytes as a float, generally for display
+/// purposes. (Most calculations in this crate use bytes directly)
+pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
+    bytes as f32 / MiB as f32
+}
+
+pub fn get_total_system_memory() -> u64 {
+    SystemExt::total_memory(&System::new_with_specifics(RefreshKind::new().with_memory()))
+}
+
+/// Global app state for the Axum server
+#[derive(Debug, Clone)]
+pub struct ServerState {
+    /// Used to close old connections.
+    ///
+    /// When a new connection is made, we send a message signalling to the old
+    /// connection to close.
+    pub sender: broadcast::Sender<()>,
+
+    /// Used to cancel all spawned tasks in the monitor.
+    pub token: CancellationToken,
+
+    /// The CLI args
+    pub args: &'static Args,
+}
+
+/// Spawn a task that may get cancelled by the provided [`CancellationToken`].
+///
+/// This is mainly meant to be called with futures that will be pending for a very
+/// long time, or are not meant to return. If it is not desirable for the future to
+/// ever resolve, such as in the case of [`cgroup::CgroupWatcher::watch`], the error can
+/// be logged with `f`.
+pub fn spawn_with_cancel<T, F>(
+    token: CancellationToken,
+    f: F,
+    future: T,
+) -> JoinHandle<Option<T::Output>>
+where
+    T: Future + Send + 'static,
+    T::Output: Send + 'static,
+    F: FnOnce(&T::Output) + Send + 'static,
+{
+    tokio::spawn(async move {
+        tokio::select! {
+            _ = token.cancelled() => {
+                info!("received global kill signal");
+                None
+            }
+            res = future => {
+                f(&res);
+                Some(res)
+            }
+        }
+    })
+}
+
+/// The entrypoint to the library: bind to `args.addr` and serve the `/monitor` websocket
+/// route until the server exits. Fails if `args.addr` is unparsable or cannot be bound.
+pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
+    // This channel is used to close old connections. When a new connection is
+    // made, we send a message signalling to the old connection to close.
+    let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
+
+    let app = Router::new()
+        // This route gets upgraded to a websocket connection. We only support
+        // one connection at a time, which we enforce by killing old connections
+        // when we receive a new one.
+        .route("/monitor", get(ws_handler))
+        .with_state(ServerState {
+            sender,
+            token,
+            args,
+        });
+
+    // `addr` is user-supplied (CLI), so a malformed value is an error, not a panic.
+    let addr = args.addr();
+    let socket_addr = addr
+        .parse::<std::net::SocketAddr>()
+        .with_context(|| format!("failed to parse listen address {addr}"))?;
+    let bound = Server::try_bind(&socket_addr)
+        .with_context(|| format!("failed to bind to {addr}"))?;
+    info!(addr, "server bound");
+
+    bound
+        .serve(app.into_make_service())
+        .await
+        .context("server exited")
+}
+
+/// Handles incoming websocket connections.
+///
+/// If we are already connected to an informant, we kill that old connection
+/// and accept the new one.
+#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
+pub async fn ws_handler(
+    ws: WebSocketUpgrade,
+    State(ServerState {
+        sender,
+        token,
+        args,
+    }): State<ServerState>,
+) -> Response {
+    // Kill the old monitor
+    info!("closing old connection if there is one");
+    let _ = sender.send(());
+
+    // Start the new one. Wow, the cycle of death and rebirth
+    let closer = sender.subscribe();
+    ws.on_upgrade(|ws| start_monitor(ws, args, closer, token))
+}
+
+/// Starts the monitor. If startup fails, times out, or the monitor exits, the
+/// outcome is logged and this function returns.
+#[tracing::instrument(skip_all, fields(?args))]
+async fn start_monitor(
+    ws: WebSocket,
+    args: &Args,
+    kill: broadcast::Receiver<()>,
+    token: CancellationToken,
+) {
+    info!("accepted new websocket connection -> starting monitor");
+    let timeout = Duration::from_secs(4);
+    let monitor = tokio::time::timeout(
+        timeout,
+        Runner::new(Default::default(), args, ws, kill, token),
+    )
+    .await;
+    let mut monitor = match monitor {
+        Ok(Ok(monitor)) => monitor,
+        Ok(Err(error)) => {
+            error!(?error, "failed to create monitor");
+            return;
+        }
+        Err(_) => {
+            error!(
+                ?timeout,
+                "creating monitor timed out (probably waiting to receive protocol range)"
+            );
+            return;
+        }
+    };
+    info!("connected to informant");
+
+    match monitor.run().await {
+        Ok(()) => info!("monitor was killed due to new connection"),
+        Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
+    }
+}
--- a/libs/vm_monitor/src/protocol.rs
+++ b/libs/vm_monitor/src/protocol.rs
@@ -0,0 +1,241 @@
+//! Types representing protocols and actual informant-monitor messages.
+//!
+//! The pervasive use of serde modifiers throughout this module is to ease
+//! serialization on the Go side. Because Go does not have enums (which model
+//! messages well), it is harder to model messages, and we accommodate that with
+//! serde.
+//!
+//! *Note*: the informant sends and receives messages in different ways.
+//!
+//! The informant sends messages serialized in the form shown below. The use
+//! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
+//! to determine how to deserialize `Content`.
+//! ```ignore
+//! struct {
+//!     Content any
+//!     Type    string
+//!     Id      uint64
+//! }
+//! ```
+//! and receives messages in the form:
+//! ```ignore
+//! struct {
+//!     {fields embedded}
+//!     Type string
+//!     Id   uint64
+//! }
+//! ```
+//! After reading the type field, the informant will decode the entire message
+//! again, this time into the correct type using the embedded fields.
+//! Because the informant cannot just extract the json contained in a certain field
+//! (it initially deserializes to `map[string]interface{}`), we keep the fields
+//! at the top level, so the entire piece of json can be deserialized into a struct,
+//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
+
+use core::fmt;
+use std::cmp;
+
+use serde::{de::Error, Deserialize, Serialize};
+
+/// A message we send to the informant: the flattened `inner` payload plus its `id`.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct OutboundMsg {
+    #[serde(flatten)]
+    pub(crate) inner: OutboundMsgKind,
+    pub(crate) id: usize,
+}
+
+impl OutboundMsg {
+    pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
+        OutboundMsg { inner, id }
+    }
+}
+
+/// The different underlying message types we can send to the informant.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type")]
+pub enum OutboundMsgKind {
+    /// Indicates that the informant sent an invalid message, i.e, we couldn't
+    /// properly deserialize it.
+    InvalidMessage { error: String },
+    /// Indicates that we experienced an internal error while processing a message.
+    /// For example, if a cgroup operation fails while trying to handle an upscale,
+    /// we return `InternalError`.
+    InternalError { error: String },
+    /// Returned to the informant once we have finished handling an upscale. If the
+    /// handling was unsuccessful, an `InternalError` will get returned instead.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleConfirmation {},
+    /// Indicates to the informant that we are urgently requesting resources.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleRequest {},
+    /// Returned to the informant once we have finished attempting to downscale. If
+    /// an error occurred trying to do so, an `InternalError` will get returned instead.
+    /// However, if we are simply unsuccessful (for example, due to needing the resources),
+    /// that gets included in the `DownscaleResult`.
+    DownscaleResult {
+        // FIXME for the future (once the informant is deprecated)
+        // As of the time of writing, the informant/agent version of this struct is
+        // called api.DownscaleResult. This struct has uppercase fields which are
+        // serialized as such. Thus, we serialize using uppercase names so we don't
+        // have to make a breaking change to the agent<->informant protocol. Once
+        // the informant has been superseded by the monitor, we can add the correct
+        // struct tags to api.DownscaleResult without causing a breaking change,
+        // since we don't need to support the agent<->informant protocol anymore.
+        #[serde(rename = "Ok")]
+        ok: bool,
+        #[serde(rename = "Status")]
+        status: String,
+    },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// informant.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// A message received from the informant.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct InboundMsg {
+    #[serde(flatten)]
+    pub(crate) inner: InboundMsgKind,
+    pub(crate) id: usize,
+}
+
+/// The different underlying message types we can receive from the informant.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type", content = "content")]
+pub enum InboundMsgKind {
+    /// Indicates that we sent an invalid message, i.e, the informant couldn't
+    /// properly deserialize it.
+    InvalidMessage { error: String },
+    /// Indicates that the informant experienced an internal error while processing
+    /// a message. For example, if it failed to request upscale from the agent, it
+    /// would return an `InternalError`.
+    InternalError { error: String },
+    /// Indicates to us that we have been granted more resources. We should respond
+    /// with an `UpscaleConfirmation` when done handling the resources (increasing
+    /// file cache size, cgroup memory limits).
+    UpscaleNotification { granted: Resources },
+    /// A request to reduce resource usage. We should respond with a `DownscaleResult`,
+    /// when done.
+    DownscaleRequest { target: Resources },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// informant.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// Represents the resources granted to a VM.
+#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
+// Renamed because the agent/informant has multiple resource types:
+// `Resources` (milliCPU/memory slots)
+// `Allocation` (vCPU/bytes) <- what we correspond to
+#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
+pub struct Resources {
+    /// Number of vCPUs
+    pub(crate) cpu: f64,
+    /// Bytes of memory
+    pub(crate) mem: u64,
+}
+
+impl Resources {
+    pub fn new(cpu: f64, mem: u64) -> Self {
+        Resources { cpu, mem }
+    }
+}
+
+pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
+pub struct ProtocolVersion(u8);
+
+impl ProtocolVersion {
+    /// Represents v1.0 of the informant <-> monitor protocol - the initial version
+    ///
+    /// Currently the latest version.
+    const V1_0: ProtocolVersion = ProtocolVersion(1);
+}
+
+impl fmt::Display for ProtocolVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            ProtocolVersion(0) => f.write_str("<invalid: zero>"),
+            ProtocolVersion::V1_0 => f.write_str("v1.0"),
+            ProtocolVersion(raw) => write!(f, "<unknown: {raw}>"), // raw u8: `{self}` would recurse
+        }
+    }
+}
+
+/// A set of protocol bounds that determines what we are speaking.
+///
+/// These bounds are inclusive.
+#[derive(Debug)]
+pub struct ProtocolRange {
+    pub min: ProtocolVersion,
+    pub max: ProtocolVersion,
+}
+
+// Hand-written `Deserialize` so that a range with `min > max` is rejected up front
+impl<'de> Deserialize<'de> for ProtocolRange {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        struct InnerProtocolRange {
+            min: ProtocolVersion,
+            max: ProtocolVersion,
+        }
+        let InnerProtocolRange { min, max } = InnerProtocolRange::deserialize(deserializer)?;
+        if min <= max {
+            return Ok(Self { min, max });
+        }
+        let reason = format!(
+            "min version = {min} is greater than max version = {max}",
+        );
+        Err(D::Error::custom(reason))
+    }
+}
+
+impl fmt::Display for ProtocolRange {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.min != self.max {
+            write!(f, "{} to {}", self.min, self.max)
+        } else {
+            write!(f, "{}", self.max)
+        }
+    }
+}
+
+impl ProtocolRange {
+    /// Find the highest shared version between two `ProtocolRange`'s
+    pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
+        // We first have to make sure the ranges are overlapping. Once we know
+        // this, the highest version both sides support is the smaller of the
+        // two maxes.
+        if self.min > other.max {
+            anyhow::bail!(
+                "Non-overlapping bounds: other.max = {} was less than self.min = {}",
+                other.max,
+                self.min,
+            )
+        } else if self.max < other.min {
+            anyhow::bail!(
+                "Non-overlapping bounds: self.max = {} was less than other.min = {}",
+                self.max,
+                other.min
+            )
+        } else {
+            Ok(cmp::min(self.max, other.max))
+        }
+    }
+}
+
+/// We send this to the informant after negotiating which protocol to use
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub enum ProtocolResponse {
+    Error(String),
+    Version(ProtocolVersion),
+}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -0,0 +1,460 @@
+//! Exposes the `Runner`, which handles messages received from informant and
+//! sends upscale requests.
+//!
+//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
+//! all functionality.
+
+use std::sync::Arc;
+use std::{fmt::Debug, mem};
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::StreamExt;
+use tokio::sync::broadcast;
+use tokio::sync::mpsc;
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+
+use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
+use crate::dispatcher::Dispatcher;
+use crate::filecache::{FileCacheConfig, FileCacheState};
+use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
+use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
+
+/// Central struct that interacts with informant, dispatcher, and cgroup to handle
+/// signals from the informant.
+#[derive(Debug)]
+pub struct Runner {
+    config: Config, // tunables; currently only `sys_buffer_bytes`
+    filecache: Option<FileCacheState>, // `Some` only when `Runner::new` got `args.pgconnstr`
+    cgroup: Option<Arc<CgroupWatcher>>, // `Some` only when `Runner::new` got `args.cgroup`
+    dispatcher: Dispatcher, // built from the websocket plus the channels shared with the cgroup
+
+    /// We "mint" new message ids by incrementing this counter and taking the value.
+    ///
+    /// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
+    /// by us vs the autoscaler-agent.
+    counter: usize,
+
+    /// A signal to kill the main thread produced by `self.run()`. This is triggered
+    /// when the server receives a new connection. When the thread receives the
+    /// signal off this channel, it will gracefully shutdown.
+    kill: broadcast::Receiver<()>,
+}
+
+/// Configuration for a `Runner`
+#[derive(Debug)]
+pub struct Config {
+    /// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
+    /// handing out the rest to userspace. This value is the estimated difference between the
+    /// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
+    ///
+    /// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
+    /// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
+    ///
+    /// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
+    /// size, rather than the self-reported memory size, according to the kernel.
+    ///
+    /// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
+    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
+    /// should be removed once we have a better solution there.
+    sys_buffer_bytes: u64, // must be non-zero: `Runner::new` rejects a config where this is 0
+}
+
+impl Default for Config {
+    /// Defaults `sys_buffer_bytes` to 100 MiB.
+    fn default() -> Self {
+        let sys_buffer_bytes = 100 * MiB;
+        Self { sys_buffer_bytes }
+    }
+}
+
+impl Runner {
+    /// Create a new monitor: build the dispatcher, then size the file cache, then the cgroup.
+    #[tracing::instrument(skip_all, fields(?config, ?args))]
+    pub async fn new(
+        config: Config,
+        args: &Args,
+        ws: WebSocket,
+        kill: broadcast::Receiver<()>,
+        token: CancellationToken,
+    ) -> anyhow::Result<Runner> {
+        anyhow::ensure!(
+            config.sys_buffer_bytes != 0,
+            "invalid monitor Config: sys_buffer_bytes cannot be 0"
+        );
+
+        // *NOTE*: the dispatcher and cgroup manager talk through these channels
+        // so make sure they each get the correct half, nothing is dropped, etc.
+        let (notified_send, notified_recv) = mpsc::channel(1);
+        let (requesting_send, requesting_recv) = mpsc::channel(1);
+
+        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
+            .await
+            .context("error creating new dispatcher")?;
+
+        let mut state = Runner {
+            config,
+            filecache: None,
+            cgroup: None,
+            dispatcher,
+            counter: 1, // NB: must be odd, see the comment about the field for more.
+            kill,
+        };
+
+        let mut file_cache_reserved_bytes = 0;
+        let mem = get_total_system_memory();
+
+        // We need to process file cache initialization before cgroup initialization, so that the memory
+        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
+        // memory limits.
+        if let Some(connstr) = &args.pgconnstr {
+            info!("initializing file cache");
+            let config: FileCacheConfig = Default::default();
+            if !config.in_memory {
+                panic!("file cache not in-memory unimplemented")
+            }
+
+            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+                .await
+                .context("failed to create file cache")?;
+
+            let size = file_cache
+                .get_file_cache_size()
+                .await
+                .context("error getting file cache size")?;
+
+            let new_size = file_cache.config.calculate_cache_size(mem);
+            info!(
+                initial = bytes_to_mebibytes(size),
+                new = bytes_to_mebibytes(new_size),
+                "setting initial file cache size",
+            );
+
+            // note: even if size == new_size, we want to explicitly set it, just
+            // to make sure that we have the permissions to do so
+            let actual_size = file_cache
+                .set_file_cache_size(new_size)
+                .await
+                .context("failed to set file cache size, possibly due to inadequate permissions")?;
+            if actual_size != new_size {
+                info!("file cache size actually got set to {actual_size}")
+            }
+            file_cache_reserved_bytes = actual_size;
+
+            state.filecache = Some(file_cache);
+        }
+
+        if let Some(name) = &args.cgroup {
+            let (mut cgroup, cgroup_event_stream) =
+                CgroupWatcher::new(name.clone(), requesting_send)
+                    .context("failed to create cgroup manager")?;
+
+            let available = mem - file_cache_reserved_bytes; // NOTE(review): assumes cache <= mem
+
+            cgroup
+                .set_memory_limits(available)
+                .context("failed to set cgroup memory limits")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            // Some might call this . . . cgroup v2
+            let cgroup_clone = Arc::clone(&cgroup);
+
+            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
+                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
+            });
+
+            state.cgroup = Some(cgroup);
+        } else {
+            // *NOTE*: We need to forget the sender so that its drop impl does not get run.
+            // This allows us to poll it in `Runner::run` regardless of whether we
+            // are managing a cgroup or not. If we don't forget it, all receives will
+            // immediately return an error because the sender is dropped and it will
+            // claim all select! statements, effectively turning `Runner::run` into
+            // `loop { fail to receive }`.
+            mem::forget(requesting_send);
+        }
+
+        Ok(state)
+    }
+
+    /// Attempt to downscale filecache + cgroup. Returns `(ok, status)`; `ok` is false if refused.
+    #[tracing::instrument(skip_all, fields(?target))]
+    pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
+        // Nothing to adjust
+        if self.cgroup.is_none() && self.filecache.is_none() {
+            info!("no action needed for downscale (no cgroup or file cache enabled)");
+            return Ok((
+                true,
+                "monitor is not managing cgroup or file cache".to_string(),
+            ));
+        }
+
+        let requested_mem = target.mem;
+        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
+        let expected_file_cache_mem_usage = self
+            .filecache
+            .as_ref()
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
+        let mut new_cgroup_mem_high = 0; // placeholder; only read inside `if let Some(cgroup)` blocks
+        if let Some(cgroup) = &self.cgroup {
+            new_cgroup_mem_high = cgroup
+                .config
+                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
+
+            let current = cgroup
+                .current_memory_usage()
+                .context("failed to fetch cgroup memory")?;
+
+            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
+                let status = format!(
+                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
+                    "calculated memory.high too low",
+                    bytes_to_mebibytes(new_cgroup_mem_high),
+                    bytes_to_mebibytes(current),
+                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
+                );
+
+                info!(status, "discontinuing downscale");
+
+                return Ok((false, status)); // refused before anything has been modified
+            }
+        }
+
+        // The downscaling has been approved. Downscale the file cache, then the cgroup.
+        let mut status = vec![];
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented")
+            }
+
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_file_cache_mem_usage)
+                .await
+                .context("failed to set file cache size")?;
+            file_cache_mem_usage = actual_usage;
+            let message = format!(
+                "set file cache size to {} MiB",
+                bytes_to_mebibytes(actual_usage)
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+
+            if file_cache_mem_usage != expected_file_cache_mem_usage { // cache size differs: redo
+                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            }
+
+            let limits = MemoryLimits::new(
+                // new_cgroup_mem_high starts out as 0, but by this point it has always been
+                // overwritten, since it is assigned in the previous `if let Some(cgroup)` block
+                new_cgroup_mem_high,
+                available_memory,
+            );
+            cgroup
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;
+
+            let message = format!(
+                "set cgroup memory.high to {} MiB, of new max {} MiB",
+                bytes_to_mebibytes(new_cgroup_mem_high),
+                bytes_to_mebibytes(available_memory)
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        // TODO: make this status thing less jank
+        let status = status.join("; ");
+        Ok((true, status))
+    }
+
+    /// Handle new resources: resize the file cache first, then update the cgroup limits to match.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
+        if self.filecache.is_none() && self.cgroup.is_none() {
+            info!("no action needed for upscale (no cgroup or file cache enabled)");
+            return Ok(());
+        }
+
+        let new_mem = resources.mem;
+        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
+
+        // Get the file cache's expected contribution to the memory usage
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented");
+            }
+
+            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
+            info!(
+                target = bytes_to_mebibytes(expected_usage),
+                total = bytes_to_mebibytes(new_mem),
+                "updating file cache size",
+            );
+
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_usage)
+                .await
+                .context("failed to set file cache size")?;
+
+            if actual_usage != expected_usage {
+                warn!(
+                    "file cache was set to a different size than we wanted: target = {} MiB, actual = {} MiB",
+                    bytes_to_mebibytes(expected_usage),
+                    bytes_to_mebibytes(actual_usage)
+                )
+            }
+            file_cache_mem_usage = actual_usage; // the cgroup budget uses what was really set
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            info!(
+                target = bytes_to_mebibytes(new_cgroup_mem_high),
+                total = bytes_to_mebibytes(new_mem),
+                name = cgroup.path(),
+                "updating cgroup memory.high",
+            );
+            let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
+            cgroup
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;
+        }
+
+        Ok(())
+    }
+
+    /// Take in a message and perform some action, such as downscaling or upscaling, and return
+    /// the message to be sent back (`None` when the inbound message needs no reply).
+    #[tracing::instrument(skip_all, fields(%id, message = ?inner))]
+    pub async fn process_message(
+        &mut self,
+        InboundMsg { inner, id }: InboundMsg,
+    ) -> anyhow::Result<Option<OutboundMsg>> {
+        match inner {
+            InboundMsgKind::UpscaleNotification { granted } => {
+                self.handle_upscale(granted)
+                    .await
+                    .context("failed to handle upscale")?;
+                self.dispatcher
+                    .notify_upscale(Sequenced::new(granted))
+                    .await
+                    .context("failed to notify cgroup of upscale")?;
+                Ok(Some(OutboundMsg::new(
+                    OutboundMsgKind::UpscaleConfirmation {},
+                    id,
+                )))
+            }
+            InboundMsgKind::DownscaleRequest { target } => self
+                .try_downscale(target)
+                .await
+                .context("failed to downscale")
+                .map(|(ok, status)| {
+                    Some(OutboundMsg::new(
+                        OutboundMsgKind::DownscaleResult { ok, status },
+                        id,
+                    ))
+                }),
+            InboundMsgKind::InvalidMessage { error } => {
+                warn!(
+                    %error, id, "received notification of an invalid message we sent"
+                );
+                Ok(None)
+            }
+            InboundMsgKind::InternalError { error } => {
+                warn!(error, id, "informant experienced an internal error");
+                Ok(None)
+            }
+            InboundMsgKind::HealthCheck {} => {
+                Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id))) // echo with same id
+            }
+        }
+    }
+
+    // Main loop: runs until killed or an error occurs. TODO: don't propagate errors, probably just warn!?
+    #[tracing::instrument(skip_all)]
+    pub async fn run(&mut self) -> anyhow::Result<()> {
+        info!("starting dispatcher");
+        loop {
+            tokio::select! {
+                signal = self.kill.recv() => {
+                    match signal {
+                        Ok(()) => return Ok(()), // kill signal received: stop cleanly
+                        Err(e) => bail!("failed to receive kill signal: {e}")
+                    }
+                }
+                // we need to propagate an upscale request
+                request = self.dispatcher.request_upscale_events.recv() => {
+                    if request.is_none() {
+                        bail!("failed to listen for upscale event from cgroup")
+                    }
+                    info!("cgroup asking for upscale; forwarding request");
+                    self.counter += 2; // Increment, preserving parity (i.e. keep the
+                                       // counter odd). See the field comment for more.
+                    self.dispatcher
+                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
+                        .await
+                        .context("failed to send message")?;
+                }
+                // there is a message from the informant
+                msg = self.dispatcher.source.next() => {
+                    if let Some(msg) = msg {
+                        // Don't use 'message' as a key as the string also uses
+                        // that for its key
+                        info!(?msg, "received message");
+                        match msg {
+                            Ok(msg) => {
+                                let message: InboundMsg = match msg {
+                                    Message::Text(text) => { // NB: a parse failure exits `run` via `?`
+                                        serde_json::from_str(&text).context("failed to deserialize text message")?
+                                    }
+                                    other => {
+                                        warn!(
+                                            // Don't use 'message' as a key as the
+                                            // string also uses that for its key
+                                            msg = ?other,
+                                            "informant should only send text messages but received different type"
+                                        );
+                                        continue
+                                    },
+                                };
+
+                                let out = match self.process_message(message.clone()).await {
+                                    Ok(Some(out)) => out,
+                                    Ok(None) => continue, // nothing to send back for this message
+                                    Err(e) => {
+                                        // handler errors are reported to the peer, not propagated
+                                        let error = e.to_string();
+                                        warn!(?error, "error handling message");
+                                        OutboundMsg::new(
+                                            OutboundMsgKind::InternalError {
+                                                error
+                                            },
+                                            message.id
+                                        )
+                                    }
+                                };
+
+                                self.dispatcher
+                                    .send(out)
+                                    .await
+                                    .context("failed to send message")?;
+                            }
+                            Err(e) => warn!("{e}"), // stream error: log it and keep looping
+                        }
+                    } else {
+                        anyhow::bail!("dispatcher connection closed")
+                    }
+                }
+            }
+        }
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -51,6 +51,7 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
+smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
 tokio-tar.workspace = true
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -215,7 +215,6 @@ fn bench_sequential(c: &mut Criterion) {
            TimelineId::generate(),
            zero.add(10 * i32)..zero.add(10 * i32 + 1),
            Lsn(i),
-            false,
            0,
        );
        updates.insert_historic(layer);
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -10,7 +10,7 @@ use std::{fs, path::Path, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
-use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
+use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -44,8 +44,6 @@ pub(crate) enum LayerCmd {
 }

 async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
-    use pageserver::tenant::block_io::BlockReader;
-
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
@@ -70,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
            },
        )
        .await?;
-    let cursor = BlockCursor::new(&file);
+    let cursor = BlockCursor::new_fileblockreader_virtual(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,11 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{
+        self,
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        Timeline,
+    },
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -108,7 +112,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
                .await;
            Ok(())
        },
@@ -121,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: GenericRemoteStorage,
+    _storage: &GenericRemoteStorage,
    tenants_dir: &Path,
    cancel: CancellationToken,
 ) {
@@ -145,14 +149,8 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                &storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;

            match res {
                Ok(()) => {}
@@ -183,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
    tenants_dir: &Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -273,7 +270,6 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -330,9 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
+    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -349,10 +346,15 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        batched
-            .entry(TimelineKey(candidate.timeline))
-            .or_default()
-            .push(candidate.layer);
+        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
+
+        // semaphore will later be used to limit eviction concurrency, and we can express at
+        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
+        // but fail gracefully by not making batches larger.
+        if batch.len() < u32::MAX as usize {
+            batch.push(candidate.layer);
+            max_batch_size = max_batch_size.max(batch.len());
+        }
    }

    let usage_planned = match warned {
@@ -369,64 +371,101 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

-    // After the loop, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
+    let mut js = tokio::task::JoinSet::new();
+
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size = batch.len();
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+
+        // I dislike naming of `available_permits` but it means current total amount of permits
+        // because permits can be added
+        assert!(batch_size as usize <= limit.available_permits());

        debug!(%timeline_id, "evicting batch for timeline");

-        async {
-            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();

-            match results {
-                Err(e) => {
-                    warn!("failed to evict batch: {:#}", e);
-                }
-                Ok(results) => {
-                    assert_eq!(results.len(), batch.len());
-                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
-                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
-                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            None => {
-                                assert!(cancel.is_cancelled());
-                                return;
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch, &cancel).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
                            }
                        }
                    }
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
+                    }
                }
+                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
-        .await;
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        if cancel.is_cancelled() {
+        js.spawn(evict);
+
+        // spawning multiple thousands of these is essentially blocking, so give already spawned a
+        // chance of making progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
+        (usage_assumed, evictions_failed)
+    };
+
+    let (usage_assumed, evictions_failed) = tokio::select! {
+        tuple = join_all => { tuple },
+        _ = cancel.cancelled() => {
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
-    }
+    };

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -441,7 +480,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 #[derive(Clone)]
 struct EvictionCandidate {
    timeline: Arc<Timeline>,
-    layer: Arc<dyn PersistentLayer>,
+    layer: Layer,
    last_activity_ts: SystemTime,
 }

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1028,7 +1028,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1053,7 +1053,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, ())
    }
@@ -1160,11 +1160,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.clone() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

    let state = state.disk_usage_eviction_state.clone();

@@ -1182,7 +1182,6 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
-                &storage,
                usage,
                &child_cancel,
            )
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -6,7 +6,7 @@ use metrics::{
    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
-use strum::VariantNames;
+use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};

@@ -570,23 +570,160 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
-    "get_rel_exists",
-    "get_rel_size",
-    "get_page_at_lsn",
-    "get_db_size",
-];
+#[derive(Debug)]
+struct GlobalAndPerTimelineHistogram {
+    global: Histogram,
+    per_tenant_timeline: Histogram,
+}

-pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+impl GlobalAndPerTimelineHistogram {
+    fn observe(&self, value: f64) {
+        self.global.observe(value);
+        self.per_tenant_timeline.observe(value);
+    }
+}
+
+struct GlobalAndPerTimelineHistogramTimer<'a> {
+    h: &'a GlobalAndPerTimelineHistogram,
+    start: std::time::Instant,
+}
+
+impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
+    fn drop(&mut self) {
+        let elapsed = self.start.elapsed();
+        self.h.observe(elapsed.as_secs_f64());
+    }
+}
+
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    IntoStaticStr,
+    strum_macros::EnumCount,
+    strum_macros::EnumIter,
+    strum_macros::FromRepr,
+)]
+#[strum(serialize_all = "snake_case")]
+pub enum SmgrQueryType {
+    GetRelExists,
+    GetRelSize,
+    GetPageAtLsn,
+    GetDbSize,
+}
+
+#[derive(Debug)]
+pub struct SmgrQueryTimePerTimeline {
+    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
+}
+
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
-        "Time spent on smgr query handling",
+        "Time spent on smgr query handling, aggregated by query type and tenant/timeline.",
        &["smgr_query_type", "tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

+static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_smgr_query_seconds_global",
+        "Time spent on smgr query handling, aggregated by query type.",
+        &["smgr_query_type"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+impl SmgrQueryTimePerTimeline {
+    pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        let tenant_id = tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        let metrics = std::array::from_fn(|i| {
+            let op = SmgrQueryType::from_repr(i).unwrap();
+            let global = SMGR_QUERY_TIME_GLOBAL
+                .get_metric_with_label_values(&[op.into()])
+                .unwrap();
+            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+                .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
+                .unwrap();
+            GlobalAndPerTimelineHistogram {
+                global,
+                per_tenant_timeline,
+            }
+        });
+        Self { metrics }
+    }
+    pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
+        let metric = &self.metrics[op as usize];
+        GlobalAndPerTimelineHistogramTimer {
+            h: metric,
+            start: std::time::Instant::now(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod smgr_query_time_tests {
+    use strum::IntoEnumIterator;
+    use utils::id::{TenantId, TimelineId};
+
+    // Regression test, we used hard-coded string constants before using an enum.
+    #[test]
+    fn op_label_name() {
+        use super::SmgrQueryType::*;
+        let expect: [(super::SmgrQueryType, &'static str); 4] = [
+            (GetRelExists, "get_rel_exists"),
+            (GetRelSize, "get_rel_size"),
+            (GetPageAtLsn, "get_page_at_lsn"),
+            (GetDbSize, "get_db_size"),
+        ];
+        for (op, expect) in expect {
+            let actual: &'static str = op.into();
+            assert_eq!(actual, expect);
+        }
+    }
+
+    #[test]
+    fn basic() {
+        let ops: Vec<_> = super::SmgrQueryType::iter().collect();
+
+        for op in &ops {
+            let tenant_id = TenantId::generate();
+            let timeline_id = TimelineId::generate();
+            let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
+
+            let get_counts = || {
+                let global: u64 = ops
+                    .iter()
+                    .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
+                    .sum();
+                let per_tenant_timeline: u64 = ops
+                    .iter()
+                    .map(|op| {
+                        metrics.metrics[*op as usize]
+                            .per_tenant_timeline
+                            .get_sample_count()
+                    })
+                    .sum();
+                (global, per_tenant_timeline)
+            };
+
+            let (pre_global, pre_per_tenant_timeline) = get_counts();
+            assert_eq!(pre_per_tenant_timeline, 0);
+
+            let timer = metrics.start_timer(*op);
+            drop(timer);
+
+            let (post_global, post_per_tenant_timeline) = get_counts();
+            assert_eq!(post_per_tenant_timeline, 1);
+            assert!(post_global > pre_global);
+        }
+    }
+}
+
 // keep in sync with control plane Go code so that we can validate
 // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
 static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
@@ -1045,6 +1182,12 @@ impl Drop for TimelineMetrics {
            .write()
            .unwrap()
            .remove(tenant_id, timeline_id);
+
+        // The following metrics are born outside of the TimelineMetrics lifecycle but still
+        // removed at the end of it. The idea is to have the metrics outlive the
+        // entity during which they're observed, e.g., the smgr metrics shall
+        // outlive an individual smgr connection, but not the timeline.
+
        for op in StorageTimeOperation::VARIANTS {
            let _ =
                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1056,8 +1199,12 @@ impl Drop for TimelineMetrics {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
        }

-        for op in SMGR_QUERY_TIME_OPERATIONS {
-            let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        for op in SmgrQueryType::iter() {
+            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
+                op.into(),
+                tenant_id,
+                timeline_id,
+            ]);
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,7 +50,8 @@ use crate::basebackup;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
-use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
+use crate::metrics;
+use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant;
@@ -306,39 +307,6 @@ async fn page_service_conn_main(
    }
 }

-struct PageRequestMetrics {
-    get_rel_exists: metrics::Histogram,
-    get_rel_size: metrics::Histogram,
-    get_page_at_lsn: metrics::Histogram,
-    get_db_size: metrics::Histogram,
-}
-
-impl PageRequestMetrics {
-    fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
-        let tenant_id = tenant_id.to_string();
-        let timeline_id = timeline_id.to_string();
-
-        let get_rel_exists =
-            SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]);
-
-        let get_rel_size =
-            SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]);
-
-        let get_page_at_lsn =
-            SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]);
-
-        let get_db_size =
-            SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]);
-
-        Self {
-            get_rel_exists,
-            get_rel_size,
-            get_page_at_lsn,
-            get_db_size,
-        }
-    }
-}
-
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
@@ -406,7 +374,7 @@ impl PageServerHandler {
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        pgb.flush().await?;

-        let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
+        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

        loop {
            let msg = tokio::select! {
@@ -446,21 +414,21 @@ impl PageServerHandler {

            let response = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    let _timer = metrics.get_rel_exists.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
                    self.handle_get_rel_exists_request(&timeline, &req, &ctx)
                        .await
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    let _timer = metrics.get_rel_size.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
                    self.handle_get_nblocks_request(&timeline, &req, &ctx).await
                }
                PagestreamFeMessage::GetPage(req) => {
-                    let _timer = metrics.get_page_at_lsn.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
                    self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
                        .await
                }
                PagestreamFeMessage::DbSize(req) => {
-                    let _timer = metrics.get_db_size.start_timer();
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
                    self.handle_db_size_request(&timeline, &req, &ctx).await
                }
            };
@@ -984,8 +952,8 @@ where
                false
            };

-            metrics::metric_vec_duration::observe_async_block_duration_by_result(
-                &*crate::metrics::BASEBACKUP_QUERY_TIME,
+            ::metrics::metric_vec_duration::observe_async_block_duration_by_result(
+                &*metrics::BASEBACKUP_QUERY_TIME,
                async move {
                    self.handle_basebackup_request(
                        pgb,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -72,7 +72,6 @@ use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
-use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;

 use crate::tenant::timeline::delete::DeleteTimelineFlow;
@@ -134,9 +133,7 @@ pub(crate) mod timeline;
 pub mod size;

 pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-pub use timeline::{
-    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
-};
+pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;
@@ -423,13 +420,53 @@ impl Tenant {
            init_order,
            CreateTimelineCause::Load,
        )?;
-        let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
+        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
+            disk_consistent_lsn.is_valid(),
            "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
        );
+        assert_eq!(
+            disk_consistent_lsn,
+            up_to_date_metadata.disk_consistent_lsn(),
+            "these are used interchangeably"
+        );
+
+        // Save the metadata file to local disk.
+        if !picked_local {
+            save_metadata(
+                self.conf,
+                &tenant_id,
+                &timeline_id,
+                up_to_date_metadata,
+                first_save,
+            )
+            .context("save_metadata")?;
+        }
+
+        let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
+
+        if let Some(index_part) = index_part {
+            timeline
+                .remote_client
+                .as_ref()
+                .unwrap()
+                .init_upload_queue(index_part)?;
+        } else if self.remote_storage.is_some() {
+            // No data on the remote storage, but we have a local metadata file. We can end up
+            // here with timeline_create being interrupted before finishing index part upload.
+            // By doing what we do here, the index part upload is retried.
+            // If control plane retries timeline creation in the meantime, the mgmt API handler
+            // for timeline creation will coalesce on the upload we queue here.
+            let rtc = timeline.remote_client.as_ref().unwrap();
+            rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
+            rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
+        }
+
        timeline
-            .load_layer_map(new_disk_consistent_lsn)
+            .load_layer_map(
+                disk_consistent_lsn,
+                remote_startup_data.map(|x| x.index_part),
+            )
            .await
            .with_context(|| {
                format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
@@ -453,19 +490,6 @@ impl Tenant {
            }
        };

-        if self.remote_storage.is_some() {
-            // Reconcile local state with remote storage, downloading anything that's
-            // missing locally, and scheduling uploads for anything that's missing
-            // in remote storage.
-            timeline
-                .reconcile_with_remote(
-                    up_to_date_metadata,
-                    remote_startup_data.as_ref().map(|r| &r.index_part),
-                )
-                .await
-                .context("failed to reconcile with remote")?
-        }
-
        // Sanity check: a timeline should have some content.
        anyhow::ensure!(
            ancestor.is_some()
@@ -480,18 +504,6 @@ impl Tenant {
            "Timeline has no ancestor and no layer files"
        );

-        // Save the metadata file to local disk.
-        if !picked_local {
-            save_metadata(
-                self.conf,
-                &tenant_id,
-                &timeline_id,
-                up_to_date_metadata,
-                first_save,
-            )
-            .context("save_metadata")?;
-        }
-
        Ok(())
    }

@@ -684,10 +696,7 @@ impl Tenant {
            debug!("successfully downloaded index part for timeline {timeline_id}");
            match index_part {
                MaybeDeletedIndexPart::IndexPart(index_part) => {
-                    timeline_ancestors.insert(
-                        timeline_id,
-                        index_part.parse_metadata().context("parse_metadata")?,
-                    );
+                    timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
                    remote_index_and_client.insert(timeline_id, (index_part, client));
                }
                MaybeDeletedIndexPart::Deleted(index_part) => {
@@ -738,7 +747,7 @@ impl Tenant {
            DeleteTimelineFlow::resume_deletion(
                Arc::clone(self),
                timeline_id,
-                &index_part.parse_metadata().context("parse_metadata")?,
+                &index_part.metadata,
                Some(remote_timeline_client),
                None,
            )
@@ -1300,10 +1309,7 @@ impl Tenant {
                        }
                    };

-                    let remote_metadata = index_part
-                        .parse_metadata()
-                        .context("parse_metadata")
-                        .map_err(LoadLocalTimelineError::Load)?;
+                    let remote_metadata = index_part.metadata.clone();
                    (
                        Some(RemoteStartupData {
                            index_part,
@@ -4033,6 +4039,7 @@ mod tests {

    #[tokio::test]
    async fn delta_layer_dumping() -> anyhow::Result<()> {
+        use storage_layer::AsLayerDesc;
        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -4040,16 +4047,18 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        let layer_map = tline.layers.read().await;
-        let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
+        let level0_deltas = layer_map
+            .layer_map()
+            .get_level0_deltas()?
+            .into_iter()
+            .map(|desc| layer_map.get_from_desc(&desc))
+            .collect::<Vec<_>>();

        assert!(!level0_deltas.is_empty());

        for delta in level0_deltas {
-            let delta = layer_map.get_from_desc(&delta);
            // Ensure we are dumping a delta layer here
-            let delta = delta.downcast_delta_layer().unwrap();
-
-            delta.dump(false, &ctx).await.unwrap();
+            assert!(delta.layer_desc().is_delta);
            delta.dump(true, &ctx).await.unwrap();
        }

@@ -4093,7 +4102,7 @@ mod tests {
        let mut found_error_message = false;
        let mut err_source = err.source();
        while let Some(source) = err_source {
-            if source.to_string() == "metadata checksum mismatch" {
+            if source.to_string().contains("metadata checksum mismatch") {
                found_error_message = true;
                break;
            }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -12,14 +12,11 @@
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use crate::page_cache::PAGE_SZ;
-use crate::tenant::block_io::{BlockCursor, BlockReader};
+use crate::tenant::block_io::BlockCursor;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
+impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,8 +2,12 @@
 //! Low-level Block-oriented I/O functions
 //!

+use super::ephemeral_file::EphemeralFile;
+use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
+use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
+use std::fs::File;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;

@@ -13,32 +17,20 @@ use std::os::unix::fs::FileExt;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
-    ///
-    /// Read a block. Returns a "lease" object that can be used to
-    /// access to the contents of the page. (For the page cache, the
-    /// lease object represents a lock on the buffer.)
-    ///
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
-
    ///
    /// Create a new "cursor" for reading from this reader.
    ///
    /// A cursor caches the last accessed page, allowing for faster
    /// access if the same block is accessed repeatedly.
-    fn block_cursor(&self) -> BlockCursor<&Self>
-    where
-        Self: Sized,
-    {
-        BlockCursor::new(self)
-    }
+    fn block_cursor(&self) -> BlockCursor<'_>;
 }

 impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        (*self).read_blk(blknum)
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        (*self).block_cursor()
    }
 }

@@ -76,6 +68,34 @@ impl<'a> Deref for BlockLease<'a> {
    }
 }

+/// Provides the ability to read blocks from different sources,
+/// similar to using a trait for this purpose.
+///
+/// Unlike a trait method, the read function here can also be made async.
+pub(crate) enum BlockReaderRef<'a> {
+    FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
+    FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
+    EphemeralFile(&'a EphemeralFile),
+    Adapter(Adapter<&'a DeltaLayerInner>),
+    #[cfg(test)]
+    TestDisk(&'a super::disk_btree::tests::TestDisk),
+}
+
+impl<'a> BlockReaderRef<'a> {
+    #[inline(always)]
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        use BlockReaderRef::*;
+        match self {
+            FileBlockReaderVirtual(r) => r.read_blk(blknum),
+            FileBlockReaderFile(r) => r.read_blk(blknum),
+            EphemeralFile(r) => r.read_blk(blknum),
+            Adapter(r) => r.read_blk(blknum),
+            #[cfg(test)]
+            TestDisk(r) => r.read_blk(blknum),
+        }
+    }
+}
+
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -93,21 +113,27 @@ impl<'a> Deref for BlockLease<'a> {
 /// // do stuff with 'buf'
 /// ```
 ///
-pub struct BlockCursor<R>
-where
-    R: BlockReader,
-{
-    reader: R,
+pub struct BlockCursor<'a> {
+    reader: BlockReaderRef<'a>,
 }

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
-    pub fn new(reader: R) -> Self {
+impl<'a> BlockCursor<'a> {
+    pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
        BlockCursor { reader }
    }
+    // Needed by cli
+    pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
+        BlockCursor {
+            reader: BlockReaderRef::FileBlockReaderVirtual(reader),
+        }
+    }

+    /// Read a block.
+    ///
+    /// Returns a "lease" object that can be used to
+    /// access the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    #[inline(always)]
    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
@@ -139,13 +165,12 @@ where
        assert!(buf.len() == PAGE_SZ);
        self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
    }
-}
-
-impl<F> BlockReader for FileBlockReader<F>
-where
-    F: FileExt,
-{
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    /// Read a block.
+    ///
+    /// Returns a "lease" object that can be used to
+    /// access the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
@@ -170,6 +195,18 @@ where
    }
 }

+impl BlockReader for FileBlockReader<File> {
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
+    }
+}
+
+impl BlockReader for FileBlockReader<VirtualFile> {
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
+    }
+}
+
 ///
 /// Trait for block-oriented output
 ///
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -7,6 +7,7 @@ use anyhow::Context;
 use pageserver_api::models::TenantState;
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, instrument, warn, Instrument, Span};

 use utils::{
@@ -82,6 +83,8 @@ async fn create_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await
    .context("mark_upload")?;
@@ -171,6 +174,8 @@ async fn remove_tenant_remote_delete_mark(
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
+            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await
        .context("remove_tenant_remote_delete_mark")?;
@@ -252,6 +257,8 @@ pub(crate) async fn remote_delete_mark_exists(
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        "fetch_tenant_deletion_mark",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await;

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -259,9 +259,10 @@ where
    {
        let mut stack = Vec::new();
        stack.push((self.root_blk, None));
+        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, opt_iter)) = stack.pop() {
            // Locate the node.
-            let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
+            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum)?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
@@ -353,8 +354,10 @@ where

        stack.push((self.root_blk, String::new(), 0, 0, 0));

+        let block_cursor = self.reader.block_cursor();
+
        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = self.reader.read_blk(self.start_blk + blknum)?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum)?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -683,29 +686,32 @@ impl<const L: usize> BuildNode<L> {
 }

 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
    use super::*;
-    use crate::tenant::block_io::BlockLease;
+    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[derive(Clone, Default)]
-    struct TestDisk {
+    pub(crate) struct TestDisk {
        blocks: Vec<Bytes>,
    }
    impl TestDisk {
        fn new() -> Self {
            Self::default()
        }
-    }
-    impl BlockReader for TestDisk {
-        fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
+        pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
            Ok(std::rc::Rc::new(buf).into())
        }
    }
+    impl BlockReader for TestDisk {
+        fn block_cursor(&self) -> BlockCursor<'_> {
+            BlockCursor::new(BlockReaderRef::TestDisk(self))
+        }
+    }
    impl BlockWriter for &mut TestDisk {
        fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
            let blknum = self.blocks.len();
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -3,8 +3,7 @@

 use crate::config::PageServerConf;
 use crate::page_cache::{self, PAGE_SZ};
-use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::{BlockLease, BlockReader};
+use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::fs::OpenOptions;
@@ -22,7 +21,7 @@ pub struct EphemeralFile {
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
    file: VirtualFile,
-    size: u64,
+    len: u64,
    /// An ephemeral file is append-only.
    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
    /// The other pages, which can no longer be modified, are accessed through the page cache.
@@ -53,27 +52,56 @@ impl EphemeralFile {
            _tenant_id: tenant_id,
            _timeline_id: timeline_id,
            file,
-            size: 0,
+            len: 0,
            mutable_tail: [0u8; PAGE_SZ],
        })
    }

-    pub(crate) fn size(&self) -> u64 {
-        self.size
+    pub(crate) fn len(&self) -> u64 {
+        self.len
    }
-}

-/// Does the given filename look like an ephemeral file?
-pub fn is_ephemeral_file(filename: &str) -> bool {
-    if let Some(rest) = filename.strip_prefix("ephemeral-") {
-        rest.parse::<u32>().is_ok()
-    } else {
-        false
+    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
+        if flushed_blknums.contains(&(blknum as u64)) {
+            let cache = page_cache::get();
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum)
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum,
+                                self.file.path.display(),
+                                e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
+                        write_guard.mark_valid();
+
+                        // Swap for read lock
+                        continue;
+                    }
+                };
+            }
+        } else {
+            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
+            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
+        }
    }
-}

-impl BlobWriter for EphemeralFile {
-    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+    pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
        struct Writer<'a> {
            ephemeral_file: &'a mut EphemeralFile,
            /// The block to which the next [`push_bytes`] will write.
@@ -84,13 +112,13 @@ impl BlobWriter for EphemeralFile {
        impl<'a> Writer<'a> {
            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
                Ok(Writer {
-                    blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
-                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
+                    blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
+                    off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
                    ephemeral_file,
                })
            }
            #[inline(always)]
-            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+            async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -154,34 +182,43 @@ impl BlobWriter for EphemeralFile {
            }
        }

-        let pos = self.size;
+        let pos = self.len;
        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
-            writer.push_bytes(&len_buf)?;
+            writer.push_bytes(&len_buf).await?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            writer.push_bytes(&len_buf)?;
+            writer.push_bytes(&len_buf).await?;
        }

        // Write the payload
-        writer.push_bytes(srcbuf)?;
+        writer.push_bytes(srcbuf).await?;

        if srcbuf.len() < 0x80 {
-            self.size += 1;
+            self.len += 1;
        } else {
-            self.size += 4;
+            self.len += 4;
        }
-        self.size += srcbuf.len() as u64;
+        self.len += srcbuf.len() as u64;

        Ok(pos)
    }
 }

+/// Checks whether `filename` has the form `ephemeral-<N>`, with `<N>` being
+/// text that parses as a `u32`.
+pub fn is_ephemeral_file(filename: &str) -> bool {
+    match filename.strip_prefix("ephemeral-") {
+        Some(suffix) => suffix.parse::<u32>().is_ok(),
+        None => false,
+    }
+}
+
 impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // drop all pages from page cache
@@ -207,52 +244,15 @@ impl Drop for EphemeralFile {
 }

 impl BlockReader for EphemeralFile {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
-        let flushed_blknums = 0..self.size / PAGE_SZ as u64;
-        if flushed_blknums.contains(&(blknum as u64)) {
-            let cache = page_cache::get();
-            loop {
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum)
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.file.path.display(),
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        let buf: &mut [u8] = write_guard.deref_mut();
-                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                        self.file
-                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
-                        write_guard.mark_valid();
-
-                        // Swap for read lock
-                        continue;
-                    }
-                };
-            }
-        } else {
-            debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
-            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
-        }
+    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
+        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
    }
 }

 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::blob_io::BlobWriter;
-    use crate::tenant::block_io::BlockCursor;
+    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;
@@ -280,12 +280,12 @@ mod tests {

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

-        let pos_foo = file.write_blob(b"foo")?;
+        let pos_foo = file.write_blob(b"foo").await?;
        assert_eq!(
            b"foo",
            file.block_cursor().read_blob(pos_foo).await?.as_slice()
        );
-        let pos_bar = file.write_blob(b"bar")?;
+        let pos_bar = file.write_blob(b"bar").await?;
        assert_eq!(
            b"foo",
            file.block_cursor().read_blob(pos_foo).await?.as_slice()
@@ -298,17 +298,17 @@ mod tests {
        let mut blobs = Vec::new();
        for i in 0..10000 {
            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data)?;
+            let pos = file.write_blob(&data).await?;
            blobs.push((pos, data));
        }
        // also test with a large blobs
        for i in 0..100 {
            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data)?;
+            let pos = file.write_blob(&data).await?;
            blobs.push((pos, data));
        }

-        let cursor = BlockCursor::new(&file);
+        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
        for (pos, expected) in blobs {
            let actual = cursor.read_blob(pos).await?;
            assert_eq!(actual, expected);
@@ -318,7 +318,7 @@ mod tests {
        let mut large_data = Vec::new();
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data)?;
+        let pos_large = file.write_blob(&large_data).await?;
        let result = file.block_cursor().read_blob(pos_large).await?;
        assert_eq!(result, large_data);

--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -50,7 +50,6 @@ use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
-use crate::tenant::storage_layer::Layer;
 use anyhow::Result;
 use std::collections::VecDeque;
 use std::ops::Range;
@@ -640,147 +639,10 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for layer in self.iter_historic_layers() {
-            layer.dump(verbose, ctx)?;
+        for desc in self.iter_historic_layers() {
+            desc.dump();
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::LayerMap;
-    use crate::tenant::storage_layer::LayerFileName;
-    use std::str::FromStr;
-    use std::sync::Arc;
-
-    mod l0_delta_layers_updated {
-
-        use crate::tenant::{
-            storage_layer::{AsLayerDesc, PersistentLayerDesc},
-            timeline::layer_manager::LayerFileManager,
-        };
-
-        use super::*;
-
-        struct LayerObject(PersistentLayerDesc);
-
-        impl AsLayerDesc for LayerObject {
-            fn layer_desc(&self) -> &PersistentLayerDesc {
-                &self.0
-            }
-        }
-
-        impl LayerObject {
-            fn new(desc: PersistentLayerDesc) -> Self {
-                LayerObject(desc)
-            }
-        }
-
-        type TestLayerFileManager = LayerFileManager<LayerObject>;
-
-        #[test]
-        fn for_full_range_delta() {
-            // l0_delta_layers are used by compaction, and should observe all buffered updates
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                 true
-             )
-        }
-
-        #[test]
-        fn for_non_full_range_delta() {
-            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                 // because not full range
-                 false
-             )
-        }
-
-        #[test]
-        fn for_image() {
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                 // code only checks if it is a full range layer, doesn't care about images, which must
-                 // mean we should in practice never have full range images
-                 false
-             )
-        }
-
-        #[test]
-        fn replacing_missing_l0_is_notfound() {
-            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
-            // however only happen for precondition failures.
-
-            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
-            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = PersistentLayerDesc::from(layer);
-
-            // same skeletan construction; see scenario below
-            let not_found = Arc::new(LayerObject::new(layer.clone()));
-            let new_version = Arc::new(LayerObject::new(layer));
-
-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
-
-            let mut mapping = TestLayerFileManager::new();
-
-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
-        }
-
-        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
-            let name = LayerFileName::from_str(layer_name).unwrap();
-            let skeleton = PersistentLayerDesc::from(name);
-
-            let remote = Arc::new(LayerObject::new(skeleton.clone()));
-            let downloaded = Arc::new(LayerObject::new(skeleton));
-
-            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();
-
-            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
-            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
-
-            let expected_in_counts = (1, usize::from(expected_l0));
-
-            map.batch_update()
-                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
-            assert_eq!(
-                count_layer_in(&map, remote.layer_desc()),
-                expected_in_counts
-            );
-
-            mapping
-                .replace_and_verify(remote, downloaded.clone())
-                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
-            );
-
-            map.batch_update().remove_historic(downloaded.layer_desc());
-            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
-        }
-
-        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
-            let historic = map
-                .iter_historic_layers()
-                .filter(|x| x.key() == layer.key())
-                .count();
-            let l0s = map
-                .get_level0_deltas()
-                .expect("why does this return a result");
-            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
-
-            (historic, l0)
-        }
-    }
-}
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
 use std::io::{self, Write};

 use anyhow::{bail, ensure, Context};
-use serde::{Deserialize, Serialize};
+use serde::{de::Error, Deserialize, Serialize, Serializer};
 use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
@@ -232,6 +232,28 @@ impl TimelineMetadata {
    }
 }

+/// Decodes a `TimelineMetadata` from a deserialized `Vec<u8>` via `from_bytes`.
+impl<'de> Deserialize<'de> for TimelineMetadata {
+    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        // The serialized form is read as a plain byte vector first.
+        let raw: Vec<u8> = Deserialize::deserialize(deserializer)?;
+        // A failure in `from_bytes` is reported as a custom serde error.
+        Self::from_bytes(raw.as_slice()).map_err(D::Error::custom)
+    }
+}
+
+/// Encodes a `TimelineMetadata` by serializing the value returned by `to_bytes`.
+impl Serialize for TimelineMetadata {
+    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        match self.to_bytes() {
+            // Hand the encoded form to the serializer unchanged.
+            Ok(encoded) => encoded.serialize(serializer),
+            // A failure in `to_bytes` is reported as a custom serde error.
+            Err(e) => Err(<S::Error as serde::ser::Error>::custom(e)),
+        }
+    }
+}
+
 /// Save timeline metadata to file
 pub fn save_metadata(
    conf: &'static PageServerConf,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -135,7 +135,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
+//!   [`Tenant::timeline_init_and_sync`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -163,8 +163,6 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -172,7 +170,6 @@
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
 //! We keep track of the fact that a client is in `Attaching` state in a marker
 //! file on the local disk. This is critical because, when we restart the pageserver,
 //! we do not want to do the `List timelines` step for each tenant that has already
@@ -192,14 +189,14 @@
 //! not created and the uploads are skipped.
 //! Theoretically, it should be ok to remove and re-add remote storage configuration to
 //! the pageserver config at any time, since it doesn't make a difference to
-//! `reconcile_with_remote`.
+//! [`Timeline::load_layer_map`].
 //! Of course, the remote timeline dir must not change while we have de-configured
 //! remote storage, i.e., the pageserver must remain the owner of the given prefix
 //! in remote storage.
 //! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
-//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
+//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

 mod delete;
 mod download;
@@ -211,6 +208,7 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
+use tokio_util::sync::CancellationToken;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -231,8 +229,10 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
+use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+pub(crate) use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::{
    config::PageServerConf,
@@ -249,7 +249,7 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::LayerFileName;
+use super::storage_layer::{LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;

 // Occasional network issues and such can cause remote operations to fail, and
@@ -353,6 +353,10 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part)?;
        self.update_remote_physical_size_gauge(Some(index_part));
+        info!(
+            "initialized upload queue from remote index with {} layer files",
+            index_part.layer_metadata.len()
+        );
        Ok(())
    }

@@ -365,6 +369,7 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_empty_remote(local_metadata)?;
        self.update_remote_physical_size_gauge(None);
+        info!("initialized upload queue as empty");
        Ok(())
    }

@@ -535,8 +540,7 @@ impl RemoteTimelineClient {
        // ahead of what's _actually_ on the remote during index upload.
        upload_queue.latest_metadata = metadata.clone();

-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-        self.schedule_index_upload(upload_queue, metadata_bytes);
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

        Ok(())
    }
@@ -556,8 +560,7 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-            self.schedule_index_upload(upload_queue, metadata_bytes);
+            self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
        }

        Ok(())
@@ -567,7 +570,7 @@ impl RemoteTimelineClient {
    fn schedule_index_upload(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        metadata_bytes: Vec<u8>,
+        metadata: TimelineMetadata,
    ) {
        info!(
            "scheduling metadata upload with {} files ({} changed)",
@@ -580,7 +583,7 @@ impl RemoteTimelineClient {
        let index_part = IndexPart::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
        self.calls_unfinished_metric_begin(&op);
@@ -594,25 +597,25 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub fn schedule_layer_file_upload(
+    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer_file_name: &LayerFileName,
-        layer_metadata: &LayerFileMetadata,
+        layer: ResidentLayer,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

+        let metadata = LayerFileMetadata::new(layer.layer_desc().file_size);
+
        upload_queue
            .latest_files
-            .insert(layer_file_name.clone(), layer_metadata.clone());
+            .insert(layer.layer_desc().filename(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

-        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
+        info!("scheduled layer file upload {layer}");
+        let op = UploadOp::UploadLayer(layer, metadata);
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);

-        info!("scheduled layer file upload {layer_file_name}");
-
        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
        Ok(())
@@ -636,7 +639,7 @@ impl RemoteTimelineClient {

        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        let metadata = upload_queue.latest_metadata.clone();

        // Update the remote index file, removing the to-be-deleted files from the index,
        // before deleting the actual files.
@@ -647,12 +650,13 @@ impl RemoteTimelineClient {
        // to syntactically forbid ? or bail! calls here.
        let no_bail_here = || {
            for name in names {
-                upload_queue.latest_files.remove(name);
-                upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                if upload_queue.latest_files.remove(name).is_some() {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                }
            }

            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata_bytes);
+                self.schedule_index_upload(upload_queue, metadata);
            }

            // schedule the actual deletions
@@ -754,7 +758,7 @@ impl RemoteTimelineClient {
        pausable_failpoint!("persist_deleted_index_part");

        backoff::retry(
-            || async {
+            || {
                upload::upload_index_part(
                    self.conf,
                    &self.storage_impl,
@@ -762,7 +766,6 @@ impl RemoteTimelineClient {
                    &self.timeline_id,
                    &index_part_with_deleted_at,
                )
-                .await
            },
            |_e| false,
            1,
@@ -771,6 +774,8 @@ impl RemoteTimelineClient {
            // when executed as part of tenant deletion this happens in the background
            2,
            "persist_index_part_with_deleted_flag",
+            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await?;

@@ -857,6 +862,7 @@ impl RemoteTimelineClient {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "list_prefixes",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
        )
        .await
        .context("list prefixes")?;
@@ -880,6 +886,7 @@ impl RemoteTimelineClient {
                FAILED_UPLOAD_WARN_THRESHOLD,
                FAILED_REMOTE_OP_RETRIES,
                "delete_objects",
+                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
            )
            .await
            .context("delete_objects")?;
@@ -901,6 +908,7 @@ impl RemoteTimelineClient {
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "delete_index",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
        )
        .await
        .context("delete_index")?;
@@ -1045,11 +1053,8 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
-                    let path = &self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(layer_file_name.file_name());
+                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
@@ -1066,6 +1071,15 @@ impl RemoteTimelineClient {
                    .await
                }
                UploadOp::UploadMetadata(ref index_part, _lsn) => {
+                    let mention_having_future_layers = if cfg!(feature = "testing") {
+                        index_part
+                            .layer_metadata
+                            .keys()
+                            .any(|x| x.is_in_future(*_lsn))
+                    } else {
+                        false
+                    };
+
                    let res = upload::upload_index_part(
                        self.conf,
                        &self.storage_impl,
@@ -1083,6 +1097,10 @@ impl RemoteTimelineClient {
                    .await;
                    if res.is_ok() {
                        self.update_remote_physical_size_gauge(Some(index_part));
+                        if mention_having_future_layers {
+                            // find rationale near crate::tenant::timeline::init::cleanup_future_layer
+                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
+                        }
                    }
                    res
                }
@@ -1134,14 +1152,13 @@ impl RemoteTimelineClient {
                    }

                    // sleep until it's time to retry, or we're cancelled
-                    tokio::select! {
-                        _ = task_mgr::shutdown_watcher() => { },
-                        _ = exponential_backoff(
-                            retries,
-                            DEFAULT_BASE_BACKOFF_SECONDS,
-                            DEFAULT_MAX_BACKOFF_SECONDS,
-                        ) => { },
-                    };
+                    exponential_backoff(
+                        retries,
+                        DEFAULT_BASE_BACKOFF_SECONDS,
+                        DEFAULT_MAX_BACKOFF_SECONDS,
+                        &shutdown_token(),
+                    )
+                    .await;
                }
            }
        }
@@ -1346,6 +1363,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
+            storage_layer::Layer,
            Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
@@ -1486,7 +1504,7 @@ mod tests {
        let TestSetup {
            harness,
            tenant: _tenant,
-            timeline: _timeline,
+            timeline,
            tenant_ctx: _tenant_ctx,
            remote_fs_dir,
            client,
@@ -1506,32 +1524,29 @@ mod tests {
            .unwrap();

        // Create a couple of dummy files,  schedule upload for them
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
-        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
-        let content_1 = dummy_contents("foo");
-        let content_2 = dummy_contents("bar");
-        let content_3 = dummy_contents("baz");

-        for (filename, content) in [
-            (&layer_file_name_1, &content_1),
-            (&layer_file_name_2, &content_2),
-            (&layer_file_name_3, &content_3),
-        ] {
-            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
-        }
+        let layers = [
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
+        ]
+        .into_iter()
+        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
+
+            Layer::for_resident(
+                harness.conf,
+                &timeline,
+                name,
+                LayerFileMetadata::new(contents.len() as u64),
+            )
+        }).collect::<Vec<_>>();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64),
-            )
+            .schedule_layer_file_upload(layers[0].clone())
            .unwrap();
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_2,
-                &LayerFileMetadata::new(content_2.len() as u64),
-            )
+            .schedule_layer_file_upload(layers[1].clone())
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1584,22 +1599,18 @@ mod tests {
                .map(|f| f.to_owned())
                .collect(),
            &[
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
            ],
        );
-        let downloaded_metadata = index_part.parse_metadata().unwrap();
-        assert_eq!(downloaded_metadata, metadata);
+        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_3,
-                &LayerFileMetadata::new(content_3.len() as u64),
-            )
+            .schedule_layer_file_upload(layers[2].clone())
            .unwrap();
        client
-            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
@@ -1614,8 +1625,8 @@ mod tests {
        }
        assert_remote_files(
            &[
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1626,8 +1637,8 @@ mod tests {

        assert_remote_files(
            &[
-                &layer_file_name_2.file_name(),
-                &layer_file_name_3.file_name(),
+                &layers[1].layer_desc().filename().file_name(),
+                &layers[2].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1641,7 +1652,7 @@ mod tests {
        let TestSetup {
            harness,
            tenant: _tenant,
-            timeline: _timeline,
+            timeline,
            client,
            ..
        } = TestSetup::new("metrics").await.unwrap();
@@ -1661,6 +1672,13 @@ mod tests {
        )
        .unwrap();

+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64),
+        );
+
        #[derive(Debug, PartialEq)]
        struct BytesStartedFinished {
            started: Option<usize>,
@@ -1686,10 +1704,7 @@ mod tests {
        let init = get_bytes_started_stopped();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64),
-            )
+            .schedule_layer_file_upload(layer_file_1.clone())
            .unwrap();

        let pre = get_bytes_started_stopped();
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,6 +11,7 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
+use tokio_util::sync::CancellationToken;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
@@ -280,6 +281,10 @@ where
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
+            unreachable!()
+        }),
    )
    .await
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -77,7 +77,9 @@ pub struct IndexPart {
    // private because internally we would read from metadata instead.
    #[serde_as(as = "DisplayFromStr")]
    disk_consistent_lsn: Lsn,
-    metadata_bytes: Vec<u8>,
+
+    #[serde(rename = "metadata_bytes")]
+    pub metadata: TimelineMetadata,
 }

 impl IndexPart {
@@ -95,7 +97,7 @@ impl IndexPart {
    pub fn new(
        layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
        disk_consistent_lsn: Lsn,
-        metadata_bytes: Vec<u8>,
+        metadata: TimelineMetadata,
    ) -> Self {
        let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
        let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
@@ -111,14 +113,10 @@ impl IndexPart {
            timeline_layers,
            layer_metadata,
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
            deleted_at: None,
        }
    }
-
-    pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
-        TimelineMetadata::from_bytes(&self.metadata_bytes)
-    }
 }

 impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -126,12 +124,12 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {

    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        let metadata = upload_queue.latest_metadata.clone();

        Ok(Self::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
        ))
    }
 }
@@ -182,7 +180,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
        };

@@ -201,7 +199,7 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        }"#;

        let expected = IndexPart {
@@ -219,7 +217,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
        };

@@ -238,7 +236,7 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            "deleted_at": "2023-07-31T09:00:00.123"
        }"#;

@@ -257,7 +255,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
        };
@@ -281,7 +279,7 @@ mod tests {
            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::new(),
            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
-            metadata_bytes: [
+            metadata: TimelineMetadata::from_bytes(&[
                136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
                38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
                210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
@@ -302,8 +300,8 @@ mod tests {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0,
-            ]
-            .to_vec(),
+            ])
+            .unwrap(),
            deleted_at: None,
        };

--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -67,6 +67,8 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
+            //
+            // This is tested against `test_compaction_delete_before_upload`
            info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,26 +4,21 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer;
 mod layer_desc;
-mod remote_layer;

-use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
-use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::path::PathBuf;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -39,9 +34,8 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-pub use remote_layer::RemoteLayer;

-use super::timeline::layer_manager::LayerManager;
+pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -76,7 +70,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from Layer::get_page_reconstruct_data
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -175,41 +169,9 @@ impl LayerAccessStats {
    ///
    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn for_loading_layer(
-        layer_map_lock_held_witness: &LayerManager,
-        status: LayerResidenceStatus,
-    ) -> Self {
+    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
-        new.record_residence_event(
-            layer_map_lock_held_witness,
-            status,
-            LayerResidenceEventReason::LayerLoad,
-        );
-        new
-    }
-
-    /// Creates a clone of `self` and records `new_status` in the clone.
-    ///
-    /// The `new_status` is not recorded in `self`.
-    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn clone_for_residence_change(
-        &self,
-        layer_map_lock_held_witness: &LayerManager,
-        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
-        let clone = {
-            let inner = self.0.lock().unwrap();
-            inner.clone()
-        };
-        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(
-            layer_map_lock_held_witness,
-            new_status,
-            LayerResidenceEventReason::ResidenceChange,
-        );
+        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
        new
    }

@@ -229,7 +191,6 @@ impl LayerAccessStats {
    ///
    pub(crate) fn record_residence_event(
        &self,
-        _layer_map_lock_held_witness: &LayerManager,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
    ) {
@@ -336,115 +297,12 @@ impl LayerAccessStats {
    }
 }

-/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
-///
-/// All layers should implement a minimal `std::fmt::Debug` without tenant or
-/// timeline names, because those are known in the context of which the layers
-/// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
-    /// Range of keys that this layer covers
-    fn get_key_range(&self) -> Range<Key>;
-
-    /// Inclusive start bound of the LSN range that this layer holds
-    /// Exclusive end bound of the LSN range that this layer holds.
-    ///
-    /// - For an open in-memory layer, this is MAX_LSN.
-    /// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
-    /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
-    fn get_lsn_range(&self) -> Range<Lsn>;
-
-    /// Does this layer only contain some data for the key-range (incremental),
-    /// or does it contain a version of every page? This is important to know
-    /// for garbage collecting old layers: an incremental layer depends on
-    /// the previous non-incremental layer.
-    fn is_incremental(&self) -> bool;
-
-    ///
-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// See PageReconstructResult for possible return values. The collected data
-    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call, or a struct with a cached older image of the page if one
-    /// is available. If this returns ValueReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data' to
-    /// collect more data.
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult>;
-
-    /// Dump summary of the contents of the layer to stdout
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
-}
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

-/// A Layer contains all data in a "rectangle" consisting of a range of keys and
-/// range of LSNs.
-///
-/// There are two kinds of layers, in-memory and on-disk layers. In-memory
-/// layers are used to ingest incoming WAL, and provide fast access to the
-/// recent page versions. On-disk layers are stored as files on disk, and are
-/// immutable. This trait presents the common functionality of in-memory and
-/// on-disk layers.
-///
-/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
-/// A delta layer contains all modifications within a range of LSNs and keys.
-/// An image layer is a snapshot of all the data in a key-range, at a single
-/// LSN.
-pub trait PersistentLayer: Layer + AsLayerDesc {
-    /// File name used for this layer, both in the pageserver's local filesystem
-    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName {
-        self.layer_desc().filename()
-    }
-
-    // Path to the layer file in the local filesystem.
-    // `None` for `RemoteLayer`.
-    fn local_path(&self) -> Option<PathBuf>;
-
-    /// Permanently remove this layer from disk.
-    fn delete_resident_layer_file(&self) -> Result<()>;
-
-    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        None
-    }
-
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        false
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
-
-    fn access_stats(&self) -> &LayerAccessStats;
-}
-
-pub fn downcast_remote_layer(
-    layer: &Arc<dyn PersistentLayer>,
-) -> Option<std::sync::Arc<RemoteLayer>> {
-    if layer.is_remote_layer() {
-        Arc::clone(layer).downcast_remote_layer()
-    } else {
-        None
-    }
-}
-
 pub mod tests {
    use super::*;

@@ -467,7 +325,6 @@ pub mod tests {
                TimelineId::from_array([0; 16]),
                value.key_range,
                value.lsn,
-                false,
                233,
            )
        }
@@ -483,19 +340,6 @@ pub mod tests {
    }
 }

-/// Helper enum to hold a PageServerConf, or a path
-///
-/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
-/// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
-/// struct for a file on disk, without having a page server running, so that we have no
-/// config. In that case, we use the Path variant to hold the full path to the file on
-/// disk.
-enum PathOrConf {
-    Path(PathBuf),
-    Conf(&'static PageServerConf),
-}
-
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -34,17 +34,16 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::storage_layer::{
-    PersistentLayer, ValueReconstructResult, ValueReconstructState,
-};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::{BufWriter, Write};
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
@@ -60,10 +59,7 @@ use utils::{
    lsn::Lsn,
 };

-use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
-    PersistentLayerDesc,
-};
+use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -183,20 +179,12 @@ impl DeltaKey {
    }
 }

-/// DeltaLayer is the in-memory data structure associated with an on-disk delta
-/// file.
-///
-/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
 pub struct DeltaLayer {
-    path_or_conf: PathOrConf,
-
+    path: PathBuf,
    pub desc: PersistentLayerDesc,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -213,6 +201,8 @@ impl std::fmt::Debug for DeltaLayer {
    }
 }

+/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
+/// file.
 pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -222,12 +212,6 @@ pub struct DeltaLayerInner {
    file: FileBlockReader<VirtualFile>,
 }

-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -237,115 +221,6 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for DeltaLayer {
-    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
-        );
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-
-        println!(
-            "index_start_blk: {}, root {}",
-            inner.index_start_blk, inner.index_root_blk
-        );
-
-        let file = &inner.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
-            file,
-        );
-
-        tree_reader.dump().await?;
-
-        let keys = DeltaLayerInner::load_keys(&Ref(&**inner)).await?;
-
-        // A subroutine to dump a single blob
-        let dump_blob = |val: ValueRef<_>| -> _ {
-            async move {
-                let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
-                let val = Value::des(&buf)?;
-                let desc = match val {
-                    Value::Image(img) => {
-                        format!(" img {} bytes", img.len())
-                    }
-                    Value::WalRecord(rec) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec)?;
-                        format!(
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )
-                    }
-                };
-                Ok(desc)
-            }
-        };
-
-        for entry in keys {
-            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val).await {
-                Ok(desc) => desc,
-                Err(err) => {
-                    let err: anyhow::Error = err;
-                    format!("ERROR: {err}")
-                }
-            };
-            println!("  key {key} at {lsn}: {desc}");
-        }
-
-        Ok(())
-    }
-
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.desc.lsn_range.start);
-
-        ensure!(self.desc.key_range.contains(&key));
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
-            .await
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_key_range(&self) -> Range<Key> {
-        self.layer_desc().key_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.layer_desc().lsn_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn is_incremental(&self) -> bool {
-        self.layer_desc().is_incremental
-    }
-}
 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
 impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -359,55 +234,17 @@ impl AsLayerDesc for DeltaLayer {
    }
 }

-impl PersistentLayer for DeltaLayer {
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        Some(self)
-    }
-
-    fn local_path(&self) -> Option<PathBuf> {
-        Some(self.path())
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
-        Ok(())
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.filename().file_name();
-        let lsn_range = self.get_lsn_range();
-
-        let access_stats = self.access_stats.as_api_model(reset);
-
-        HistoricLayerInfo::Delta {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start: lsn_range.start,
-            lsn_end: lsn_range.end,
-            remote: false,
-            access_stats,
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
 impl DeltaLayer {
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
-        fname: &DeltaFileName,
-    ) -> PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.clone(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(tenant_id, timeline_id)
-                .join(fname.to_string()),
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        self.desc.dump();
+
+        if !verbose {
+            return Ok(());
        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+
+        inner.dump().await
    }

    fn temp_path_for(
@@ -453,52 +290,22 @@ impl DeltaLayer {
    async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = DeltaLayerInner::load(&path, None)?;

-        let loaded = DeltaLayerInner::load(&path, summary)?;
+        // not production code

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
+        let actual_filename = self.path.file_name().unwrap().to_str().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(Arc::new(loaded))
    }

-    /// Create a DeltaLayer struct representing an existing file on disk.
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &DeltaFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> DeltaLayer {
-        DeltaLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_delta(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn_range.clone(),
-                file_size,
-            ),
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -513,7 +320,7 @@ impl DeltaLayer {
            .context("get file metadata to determine size")?;

        Ok(DeltaLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
                summary.tenant_id,
                summary.timeline_id,
@@ -526,34 +333,9 @@ impl DeltaLayer {
        })
    }

-    fn layer_name(&self) -> DeltaFileName {
-        self.desc.delta_file_name()
-    }
-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            &self.desc.tenant_id,
-            &self.desc.timeline_id,
-            &self.layer_name(),
-        )
-    }
-    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
-    ///
-    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(
-        &self,
-        ctx: &RequestContext,
-    ) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .await
-            .context("load delta layer keys")?;
-
-        let inner = Ref(&**inner);
-        DeltaLayerInner::load_keys(&inner)
-            .await
-            .context("Layer index is corrupted")
+    /// Path to the layer file
+    fn path(&self) -> PathBuf {
+        self.path.clone()
    }
 }

@@ -658,7 +440,7 @@ impl DeltaLayerWriterInner {
    ///
    /// Finish writing the delta layer.
    ///
-    fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -704,37 +486,21 @@ impl DeltaLayerWriterInner {
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = DeltaLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc: PersistentLayerDesc::new_delta(
-                self.tenant_id,
-                self.timeline_id,
-                self.key_start..key_end,
-                self.lsn_range.clone(),
-                metadata.len(),
-            ),
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };
+
+        let desc = PersistentLayerDesc::new_delta(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_start..key_end,
+            self.lsn_range.clone(),
+            metadata.len(),
+        );

        // fsync the file
        file.sync_all()?;
-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = DeltaLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            &self.tenant_id,
-            &self.timeline_id,
-            &DeltaFileName {
-                key_range: self.key_start..key_end,
-                lsn_range: self.lsn_range,
-            },
-        );
-        std::fs::rename(self.path, &final_path)?;

-        trace!("created delta layer {}", final_path.display());
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+
+        trace!("created delta layer {}", layer.local_path().display());

        Ok(layer)
    }
@@ -817,8 +583,12 @@ impl DeltaLayerWriter {
    ///
    /// Finish writing the delta layer.
    ///
-    pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
-        self.inner.take().unwrap().finish(key_end)
+    pub(crate) fn finish(
+        mut self,
+        key_end: Key,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner.take().unwrap().finish(key_end, timeline)
    }
 }

@@ -945,16 +715,16 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
-        this: &T,
-    ) -> Result<Vec<DeltaEntry<T>>> {
-        let dl = this.as_ref();
-        let file = &dl.file;
+    pub(super) async fn load_keys(&self) -> Result<Vec<DeltaEntry<'_>>> {
+        let file = &self.file;

-        let tree_reader =
-            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );

-        let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
+        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

        tree_reader
            .visit(
@@ -964,7 +734,9 @@ impl DeltaLayerInner {
                    let delta_key = DeltaKey::from_slice(key);
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
-                        reader: BlockCursor::new(Adapter(this.clone())),
+                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
+                            Adapter(self),
+                        )),
                    };
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
@@ -987,49 +759,80 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
-}

-/// Cloneable borrow wrapper to make borrows behave like smart pointers.
-///
-/// Shared references are trivially copyable. This wrapper avoids (confusion) to otherwise attempt
-/// cloning DeltaLayerInner.
-pub(crate) struct Ref<T>(T);
+    pub(super) async fn dump(&self) -> anyhow::Result<()> {
+        println!(
+            "index_start_blk: {}, root {}",
+            self.index_start_blk, self.index_root_blk
+        );

-impl<'a, T> AsRef<T> for Ref<&'a T> {
-    fn as_ref(&self) -> &T {
-        self.0
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );
+
+        tree_reader.dump().await?;
+
+        let keys = self.load_keys().await?;
+
+        async fn dump_blob(val: ValueRef<'_>) -> anyhow::Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        }
+
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }
+
+        Ok(())
    }
 }

-impl<'a, T> Clone for Ref<&'a T> {
-    fn clone(&self) -> Self {
-        *self
-    }
-}
-
-impl<'a, T> Copy for Ref<&'a T> {}
-
 /// A set of data associated with a delta layer key and its value
-pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
+pub struct DeltaEntry<'a> {
    pub key: Key,
    pub lsn: Lsn,
    /// Size of the stored value
    pub size: u64,
    /// Reference to the on-disk value
-    pub val: ValueRef<T>,
+    pub val: ValueRef<'a>,
 }

 /// Reference to an on-disk value
-pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
+pub struct ValueRef<'a> {
    blob_ref: BlobRef,
-    reader: BlockCursor<Adapter<T>>,
+    reader: BlockCursor<'a>,
 }

-impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
+impl<'a> ValueRef<'a> {
    /// Loads the value from disk
    pub async fn load(&self) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
@@ -1039,10 +842,16 @@ impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
    }
 }

-struct Adapter<T: AsRef<DeltaLayerInner>>(T);
+pub(crate) struct Adapter<T>(T);

-impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
+    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.0.as_ref().file.read_blk(blknum)
    }
 }
+
+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
+    }
+}
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -212,9 +212,20 @@ pub enum LayerFileName {
 }

 impl LayerFileName {
-    pub fn file_name(&self) -> String {
+    pub(crate) fn file_name(&self) -> String {
        self.to_string()
    }
+
+    /// Determines if this layer file is considered to be in the future, meaning we will discard
+    /// such layers during timeline initialization from the given disk_consistent_lsn.
+    pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
+        use LayerFileName::*;
+        match self {
+            Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
+            Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
+            _ => false,
+        }
+    }
 }

 impl fmt::Display for LayerFileName {
@@ -263,8 +274,8 @@ impl serde::Serialize for LayerFileName {
        S: serde::Serializer,
    {
        match self {
-            Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
-            Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
+            Self::Image(fname) => serializer.collect_str(fname),
+            Self::Delta(fname) => serializer.collect_str(fname),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -31,22 +31,24 @@ use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
-    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::Write;
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
+use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -57,7 +59,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
+use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -115,22 +117,14 @@ impl Summary {
    }
 }

-/// ImageLayer is the in-memory data structure associated with an on-disk image
-/// file.
-///
-/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
 pub struct ImageLayer {
-    path_or_conf: PathOrConf,
-
+    path: PathBuf,
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<ImageLayerInner>,
 }

@@ -147,6 +141,8 @@ impl std::fmt::Debug for ImageLayer {
    }
 }

+/// ImageLayerInner is the in-memory data structure associated with an on-disk image
+/// file.
 pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -167,29 +163,11 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for ImageLayer {
-    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental,
-            self.desc.file_size
-        );
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-        let file = &inner.file;
+impl ImageLayerInner {
+    pub(super) async fn dump(&self) -> anyhow::Result<()> {
+        let file = &self.file;
        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
+            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);

        tree_reader.dump().await?;

@@ -202,43 +180,6 @@ impl Layer for ImageLayer {

        Ok(())
    }
-
-    /// Look up given page in the file
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.desc.key_range.contains(&key));
-        assert!(lsn_range.start >= self.lsn);
-        assert!(lsn_range.end >= self.lsn);
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, reconstruct_state)
-            .await
-            // FIXME: makes no sense to dump paths
-            .with_context(|| format!("read {}", self.path().display()))
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_key_range(&self) -> Range<Key> {
-        self.layer_desc().key_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.layer_desc().lsn_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn is_incremental(&self) -> bool {
-        self.layer_desc().is_incremental
-    }
 }

 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -254,48 +195,19 @@ impl AsLayerDesc for ImageLayer {
    }
 }

-impl PersistentLayer for ImageLayer {
-    fn local_path(&self) -> Option<PathBuf> {
-        Some(self.path())
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
-        Ok(())
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.filename().file_name();
-        let lsn_range = self.get_lsn_range();
-
-        HistoricLayerInfo::Image {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start: lsn_range.start,
-            remote: false,
-            access_stats: self.access_stats.as_api_model(reset),
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
 impl ImageLayer {
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        fname: &ImageFileName,
-    ) -> PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.to_path_buf(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(&tenant_id, &timeline_id)
-                .join(fname.to_string()),
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        self.desc.dump();
+
+        if !verbose {
+            return Ok(());
        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+
+        inner.dump().await?;
+
+        Ok(())
    }

    fn temp_path_for(
@@ -333,53 +245,21 @@ impl ImageLayer {
    async fn load_inner(&self) -> Result<ImageLayerInner> {
        let path = self.path();

-        let expected_summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None)?;

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
+        // not production code
+        let actual_filename = self.path.file_name().unwrap().to_str().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(loaded)
    }

-    /// Create an ImageLayer struct representing an existing file on disk
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &ImageFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> ImageLayer {
-        ImageLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_img(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn,
-                false,
-                file_size,
-            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
-            lsn: filename.lsn,
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -392,13 +272,12 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
                summary.key_range,
                summary.lsn,
-                false,
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
@@ -407,18 +286,9 @@ impl ImageLayer {
        })
    }

-    fn layer_name(&self) -> ImageFileName {
-        self.desc.image_file_name()
-    }
-
    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            self.desc.timeline_id,
-            self.desc.tenant_id,
-            &self.layer_name(),
-        )
+    fn path(&self) -> PathBuf {
+        self.path.clone()
    }
 }

@@ -500,7 +370,6 @@ struct ImageLayerWriterInner {
    tenant_id: TenantId,
    key_range: Range<Key>,
    lsn: Lsn,
-    is_incremental: bool,

    blob_writer: WriteBlobWriter<VirtualFile>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -516,7 +385,6 @@ impl ImageLayerWriterInner {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
-        is_incremental: bool,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
@@ -551,7 +419,6 @@ impl ImageLayerWriterInner {
            lsn,
            tree: tree_builder,
            blob_writer,
-            is_incremental,
        };

        Ok(writer)
@@ -576,7 +443,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
-    fn finish(self) -> anyhow::Result<ImageLayer> {
+    fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -612,40 +479,19 @@ impl ImageLayerWriterInner {
            self.timeline_id,
            self.key_range.clone(),
            self.lsn,
-            self.is_incremental, // for now, image layer ALWAYS covers the full range
            metadata.len(),
        );

        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = ImageLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc,
-            lsn: self.lsn,
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };

        // fsync the file
        file.sync_all()?;

-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = ImageLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            self.timeline_id,
-            self.tenant_id,
-            &ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn,
-            },
-        );
-        std::fs::rename(self.path, final_path)?;
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", layer.path().display());
+        trace!("created image layer {}", layer.local_path().display());

        Ok(layer)
    }
@@ -687,7 +533,6 @@ impl ImageLayerWriter {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
-        is_incremental: bool,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(ImageLayerWriterInner::new(
@@ -696,7 +541,6 @@ impl ImageLayerWriter {
                tenant_id,
                key_range,
                lsn,
-                is_incremental,
            )?),
        })
    }
@@ -713,8 +557,11 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
-    pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
-        self.inner.take().unwrap().finish()
+    pub(crate) fn finish(
+        mut self,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner.take().unwrap().finish(timeline)
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,16 +7,15 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::{Key, Value};
-use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
-use std::cell::RefCell;
 use std::collections::HashMap;
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -30,13 +29,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::RwLock;

-use super::{DeltaLayer, DeltaLayerWriter, Layer};
-
-thread_local! {
-    /// A buffer for serializing object during [`InMemoryLayer::put_value`].
-    /// This buffer is reused for each serialization to avoid additional malloc calls.
-    static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
-}
+use super::{DeltaLayerWriter, ResidentLayer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -85,11 +78,11 @@ impl std::fmt::Debug for InMemoryLayerInner {
 }

 impl InMemoryLayer {
-    pub fn get_timeline_id(&self) -> TimelineId {
+    pub(crate) fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }

-    pub fn info(&self) -> InMemoryLayerInfo {
+    pub(crate) fn info(&self) -> InMemoryLayerInfo {
        let lsn_start = self.start_lsn;

        if let Some(&lsn_end) = self.end_lsn.get() {
@@ -99,32 +92,22 @@ impl InMemoryLayer {
        }
    }

-    fn assert_writable(&self) {
+    pub(crate) fn assert_writable(&self) {
        assert!(self.end_lsn.get().is_none());
    }

-    fn end_lsn_or_max(&self) -> Lsn {
+    pub(crate) fn end_lsn_or_max(&self) -> Lsn {
        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
    }
-}

-#[async_trait::async_trait]
-impl Layer for InMemoryLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        Key::MIN..Key::MAX
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
+    pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
        self.start_lsn..self.end_lsn_or_max()
    }

-    fn is_incremental(&self) -> bool {
-        // in-memory layer is always considered incremental.
-        true
-    }
-
    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    ///
+    /// this is likely completely unused
+    pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().await;

        let end_str = self.end_lsn_or_max();
@@ -171,7 +154,7 @@ impl Layer for InMemoryLayer {
    }

    /// Look up given value in the layer.
-    async fn get_value_reconstruct_data(
+    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -229,17 +212,13 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
-    ///
    /// Get layer size.
-    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
-        Ok(inner.file.size())
+        Ok(inner.file.len())
    }

-    ///
    /// Create a new, empty, in-memory layer
-    ///
    pub fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -269,17 +248,17 @@ impl InMemoryLayer {
    /// Adds the page version to the in-memory tree
    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let mut inner = self.inner.write().await;
+        let inner: &mut _ = &mut *self.inner.write().await;
        self.assert_writable();

        let off = {
-            SER_BUFFER.with(|x| -> Result<_> {
-                let mut buf = x.borrow_mut();
-                buf.clear();
-                val.ser_into(&mut (*buf))?;
-                let off = inner.file.write_blob(&buf)?;
-                Ok(off)
-            })?
+            // Avoid doing allocations for "small" values.
+            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+            buf.clear();
+            val.ser_into(&mut buf)?;
+            inner.file.write_blob(&buf).await?
        };

        let vec_map = inner.index.entry(key).or_default();
@@ -317,7 +296,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub(crate) async fn write_to_disk(&self, timeline: &Arc<Timeline>) -> Result<ResidentLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -356,7 +335,7 @@ impl InMemoryLayer {
            }
        }

-        let delta_layer = delta_layer_writer.finish(Key::MAX)?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline)?;
        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,4 +1,3 @@
-use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
@@ -6,7 +5,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{context::RequestContext, repository::Key};
+use crate::repository::Key;

 use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -19,16 +18,17 @@ use serde::{Deserialize, Serialize};
 pub struct PersistentLayerDesc {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
+    /// Range of keys that this layer covers
    pub key_range: Range<Key>,
-    /// For image layer, this is `[lsn, lsn+1)`.
+    /// Inclusive start, exclusive end of the LSN range that this layer holds.
+    ///
+    /// - For an open in-memory layer, the end bound is MAX_LSN
+    /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
+    ///   range start
+    /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
    pub lsn_range: Range<Lsn>,
-    /// Whether this is a delta layer.
+    /// Whether this is a delta layer; delta layers are also the incremental ones.
    pub is_delta: bool,
-    /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
-    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
-    /// incremental.
-    pub is_incremental: bool,
-    /// File size
    pub file_size: u64,
 }

@@ -61,7 +61,6 @@ impl PersistentLayerDesc {
            key_range,
            lsn_range: Lsn(0)..Lsn(1),
            is_delta: false,
-            is_incremental: false,
            file_size: 0,
        }
    }
@@ -71,7 +70,6 @@ impl PersistentLayerDesc {
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn: Lsn,
-        is_incremental: bool,
        file_size: u64,
    ) -> Self {
        Self {
@@ -80,7 +78,6 @@ impl PersistentLayerDesc {
            key_range,
            lsn_range: Self::image_layer_lsn_range(lsn),
            is_delta: false,
-            is_incremental,
            file_size,
        }
    }
@@ -98,11 +95,26 @@ impl PersistentLayerDesc {
            key_range,
            lsn_range,
            is_delta: true,
-            is_incremental: true,
            file_size,
        }
    }

+    pub fn from_filename(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        filename: LayerFileName,
+        file_size: u64,
+    ) -> Self {
+        match filename {
+            LayerFileName::Image(i) => {
+                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
+            }
+            LayerFileName::Delta(d) => {
+                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+            }
+        }
+    }
+
    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -164,29 +176,43 @@ impl PersistentLayerDesc {
        self.tenant_id
    }

+    /// Does this layer only contain some data for the key-range (incremental),
+    /// or does it contain a version of every page? This is important to know
+    /// for garbage collecting old layers: an incremental layer depends on
+    /// the previous non-incremental layer.
    pub fn is_incremental(&self) -> bool {
-        self.is_incremental
+        self.is_delta
    }

    pub fn is_delta(&self) -> bool {
        self.is_delta
    }

-    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental,
-            self.file_size,
-        );
-
-        Ok(())
+    pub fn dump(&self) {
+        if self.is_delta {
+            println!(
+                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.lsn_range.start,
+                self.lsn_range.end,
+                self.is_incremental(),
+                self.file_size,
+            );
+        } else {
+            println!(
+                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.image_layer_lsn(),
+                self.is_incremental(),
+                self.file_size
+            );
+        }
    }

    pub fn file_size(&self) -> u64 {
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -1,254 +0,0 @@
-//! A RemoteLayer is an in-memory placeholder for a layer file that exists
-//! in remote storage.
-//!
-use crate::config::PageServerConf;
-use crate::context::RequestContext;
-use crate::repository::Key;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::timeline::layer_manager::LayerManager;
-use anyhow::{bail, Result};
-use pageserver_api::models::HistoricLayerInfo;
-use std::ops::Range;
-use std::path::PathBuf;
-use std::sync::Arc;
-
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-use super::filename::{DeltaFileName, ImageFileName};
-use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
-    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
-};
-
-/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
-/// [`DeltaLayer`](super::DeltaLayer).
-///
-/// RemoteLayer might be downloaded on-demand during operations which are
-/// allowed download remote layers and during which, it gets replaced with a
-/// concrete `DeltaLayer` or `ImageLayer`.
-///
-/// See: [`crate::context::RequestContext`] for authorization to download
-pub struct RemoteLayer {
-    pub desc: PersistentLayerDesc,
-
-    pub layer_metadata: LayerFileMetadata,
-
-    access_stats: LayerAccessStats,
-
-    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
-
-    /// Has `LayerMap::replace` failed for this (true) or not (false).
-    ///
-    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
-    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
-    /// unprocessable, because a LayerMap::replace failed.
-    ///
-    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
-    /// a possible fast loop between `Timeline::get_reconstruct_data` and
-    /// `Timeline::download_remote_layer`, which also logs.
-    ///
-    /// [`ongoing_download`]: Self::ongoing_download
-    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
-}
-
-impl std::fmt::Debug for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.desc.filename())
-            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.desc.is_incremental)
-            .finish()
-    }
-}
-
-#[async_trait::async_trait]
-impl Layer for RemoteLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_state: &mut ValueReconstructState,
-        _ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        bail!("layer {self} needs to be downloaded");
-    }
-
-    /// debugging function to print out the contents of the layer
-    async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.is_delta,
-            self.desc.is_incremental,
-            self.desc.file_size,
-        );
-
-        Ok(())
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_key_range(&self) -> Range<Key> {
-        self.layer_desc().key_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.layer_desc().lsn_range.clone()
-    }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn is_incremental(&self) -> bool {
-        self.layer_desc().is_incremental
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for RemoteLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for RemoteLayer {
-    fn local_path(&self) -> Option<PathBuf> {
-        None
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        bail!("remote layer has no layer file");
-    }
-
-    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        Some(self)
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        true
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.filename().file_name();
-        let lsn_range = self.get_lsn_range();
-
-        if self.desc.is_delta {
-            HistoricLayerInfo::Delta {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                lsn_end: lsn_range.end,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        } else {
-            HistoricLayerInfo::Image {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
-impl RemoteLayer {
-    pub fn new_img(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &ImageFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_img(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn,
-                false,
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    pub fn new_delta(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &DeltaFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_delta(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn_range.clone(),
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub fn create_downloaded_layer(
-        &self,
-        layer_map_lock_held_witness: &LayerManager,
-        conf: &'static PageServerConf,
-        file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
-        if self.desc.is_delta {
-            let fname = self.desc.delta_file_name();
-            Arc::new(DeltaLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats.clone_for_residence_change(
-                    layer_map_lock_held_witness,
-                    LayerResidenceStatus::Resident,
-                ),
-            ))
-        } else {
-            let fname = self.desc.image_file_name();
-            Arc::new(ImageLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats.clone_for_residence_change(
-                    layer_map_lock_held_witness,
-                    LayerResidenceStatus::Resident,
-                ),
-            ))
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -29,7 +29,6 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        storage_layer::PersistentLayer,
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -194,15 +193,26 @@ impl Timeline {
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
-        let candidates: Vec<Arc<dyn PersistentLayer>> = {
+        let candidates: Vec<_> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);
-                if hist_layer.is_remote_layer() {
-                    continue;
-                }
+
+                // guard against eviction while we inspect it; it might be that eviction_task and
+                // disk_usage_eviction_task both select the same layers to be evicted, and
+                // seemingly free up double the space. both succeeding is of no consequence.
+                let guard = match hist_layer.keep_resident().await {
+                    Ok(Some(l)) => l,
+                    Ok(None) => continue,
+                    Err(e) => {
+                        // these should not happen, but we cannot make them statically impossible right
+                        // now.
+                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
+                        continue;
+                    }
+                };

                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
@@ -233,7 +243,7 @@ impl Timeline {
                    }
                };
                if no_activity_for > p.threshold {
-                    candidates.push(hist_layer)
+                    candidates.push(guard.drop_eviction_guard())
                }
            }
            candidates
@@ -252,7 +262,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
+            .evict_layer_batch(remote_client, &candidates, cancel)
            .await
        {
            Err(pre_err) => {
@@ -263,7 +273,7 @@ impl Timeline {
            Ok(results) => results,
        };
        assert_eq!(results.len(), candidates.len());
-        for (l, result) in candidates.iter().zip(results) {
+        for result in results {
            match result {
                None => {
                    stats.skipped_for_shutdown += 1;
@@ -271,20 +281,10 @@ impl Timeline {
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                    stats.not_evictable += 1;
-                }
-                Some(Err(EvictionError::FileNotFound)) => {
+                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
-                Some(Err(
-                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
-                )) => {
-                    let e = utils::error::report_compact_sources(&e);
-                    warn!(layer = %l, "failed to evict layer: {e}");
-                    stats.not_evictable += 1;
-                }
            }
        }
        if stats.candidates == stats.not_evictable {
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -0,0 +1,199 @@
+use crate::{
+    is_temporary,
+    tenant::{
+        ephemeral_file::is_ephemeral_file,
+        remote_timeline_client::{
+            self,
+            index::{IndexPart, LayerFileMetadata},
+        },
+        storage_layer::LayerFileName,
+    },
+    METADATA_FILE_NAME,
+};
+use anyhow::Context;
+use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
+use utils::lsn::Lsn;
+
+/// Identified files in the timeline directory.
+pub(super) enum Discovered {
+    /// The only one we care about
+    Layer(LayerFileName, u64),
+    /// Old ephemeral files from previous launches, should be removed
+    Ephemeral(OsString),
+    /// Old temporary timeline files, unsure what these really are, should be removed
+    Temporary(OsString),
+    /// Temporary on-demand download files, should be removed
+    TemporaryDownload(OsString),
+    /// "metadata" file we persist locally and include in `index_part.json`
+    Metadata,
+    /// Backup file from previously future layers
+    IgnoredBackup,
+    /// Unrecognized, warn about these
+    Unknown(OsString),
+}
+
+/// Scans the timeline directory for interesting files.
+pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
+    let mut ret = Vec::new();
+
+    for direntry in std::fs::read_dir(path)? {
+        let direntry = direntry?;
+        let direntry_path = direntry.path();
+        let file_name = direntry.file_name();
+
+        let fname = file_name.to_string_lossy();
+
+        let discovered = match LayerFileName::from_str(&fname) {
+            Ok(file_name) => {
+                let file_size = direntry.metadata()?.len();
+                Discovered::Layer(file_name, file_size)
+            }
+            Err(_) => {
+                if fname == METADATA_FILE_NAME {
+                    Discovered::Metadata
+                } else if fname.ends_with(".old") {
+                    // ignore these
+                    Discovered::IgnoredBackup
+                } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
+                    Discovered::TemporaryDownload(file_name)
+                } else if is_ephemeral_file(&fname) {
+                    Discovered::Ephemeral(file_name)
+                } else if is_temporary(&direntry_path) {
+                    Discovered::Temporary(file_name)
+                } else {
+                    Discovered::Unknown(file_name)
+                }
+            }
+        };
+
+        ret.push(discovered);
+    }
+
+    Ok(ret)
+}
+
+/// Decision on what to do with a layer file after considering its local and remote metadata.
+#[derive(Clone)]
+pub(super) enum Decision {
+    /// The layer is not present locally.
+    Evicted(LayerFileMetadata),
+    /// The layer is present locally, but local metadata does not match remote; we must
+    /// delete it and treat it as evicted.
+    UseRemote {
+        local: LayerFileMetadata,
+        remote: LayerFileMetadata,
+    },
+    /// The layer is present locally, and metadata matches.
+    UseLocal(LayerFileMetadata),
+    /// The layer is only known locally, it needs to be uploaded.
+    NeedsUpload(LayerFileMetadata),
+}
+
+/// The related layer is in the future compared to disk_consistent_lsn; it must not be loaded.
+#[derive(Debug)]
+pub(super) struct FutureLayer {
+    /// The local metadata. `None` if the layer is only known through [`IndexPart`].
+    pub(super) local: Option<LayerFileMetadata>,
+}
+
+/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
+///
+/// This function should not gain reasons to fail other than [`FutureLayer`]; consider adding
+/// any such checks earlier, to [`scan_timeline_dir`].
+pub(super) fn reconcile(
+    discovered: Vec<(LayerFileName, u64)>,
+    index_part: Option<&IndexPart>,
+    disk_consistent_lsn: Lsn,
+) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
+    use Decision::*;
+
+    // name => (local, remote)
+    type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
+
+    let mut discovered = discovered
+        .into_iter()
+        .map(|(name, file_size)| (name, (Some(LayerFileMetadata::new(file_size)), None)))
+        .collect::<Collected>();
+
+    // merge any index_part information, when available
+    index_part
+        .as_ref()
+        .map(|ip| ip.layer_metadata.iter())
+        .into_iter()
+        .flatten()
+        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
+        .for_each(|(name, metadata)| {
+            if let Some(existing) = discovered.get_mut(name) {
+                existing.1 = Some(metadata);
+            } else {
+                discovered.insert(name.to_owned(), (None, Some(metadata)));
+            }
+        });
+
+    discovered
+        .into_iter()
+        .map(|(name, (local, remote))| {
+            let decision = if name.is_in_future(disk_consistent_lsn) {
+                Err(FutureLayer { local })
+            } else {
+                Ok(match (local, remote) {
+                    (Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
+                    (Some(x), Some(_)) => UseLocal(x),
+                    (None, Some(x)) => Evicted(x),
+                    (Some(x), None) => NeedsUpload(x),
+                    (None, None) => {
+                        unreachable!("there must not be any non-local non-remote files")
+                    }
+                })
+            };
+
+            (name, decision)
+        })
+        .collect::<Vec<_>>()
+}
+
+pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
+    let file_name = path.file_name().expect("must be file path");
+    tracing::debug!(kind, ?file_name, "cleaning up");
+    std::fs::remove_file(path)
+        .with_context(|| format!("failed to remove {kind} at {}", path.display()))
+}
+
+pub(super) fn cleanup_local_file_for_remote(
+    path: &Path,
+    local: &LayerFileMetadata,
+    remote: &LayerFileMetadata,
+) -> anyhow::Result<()> {
+    let local_size = local.file_size();
+    let remote_size = remote.file_size();
+
+    let file_name = path.file_name().expect("must be file path");
+    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
+        assert!(
+            path.exists(),
+            "we would leave the local_layer without a file if this does not hold: {}",
+            path.display()
+        );
+        Err(err)
+    } else {
+        Ok(())
+    }
+}
+
+pub(super) fn cleanup_future_layer(
+    path: &Path,
+    name: &LayerFileName,
+    disk_consistent_lsn: Lsn,
+) -> anyhow::Result<()> {
+    use LayerFileName::*;
+    let kind = match name {
+        Delta(_) => "delta",
+        Image(_) => "image",
+    };
+    // future image layers may always be produced for lsns that are still stored in an
+    // InMemoryLayer and not yet flushed to disk.
+    tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
+    crate::tenant::timeline::rename_to_backup(path)?;
+    Ok(())
+}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -8,42 +8,40 @@ use utils::{

 use crate::{
    config::PageServerConf,
-    metrics::TimelineMetrics,
    tenant::{
        layer_map::{BatchedUpdates, LayerMap},
        storage_layer::{
-            AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer,
-            PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
+            AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
+            ResidentLayer,
        },
-        timeline::compare_arced_layers,
    },
 };

 /// Provides semantic APIs to manipulate the layer map.
-pub struct LayerManager {
+pub(crate) struct LayerManager {
    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager,
+    layer_fmgr: LayerFileManager<Layer>,
 }

 /// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
 /// scheduling deletes in remote client.
-pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
+pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);

 impl ApplyGcResultGuard<'_> {
-    pub fn flush(self) {
+    pub(crate) fn flush(self) {
        self.0.flush();
    }
 }

 impl LayerManager {
-    pub fn create() -> Self {
+    pub(crate) fn create() -> Self {
        Self {
            layer_map: LayerMap::default(),
            layer_fmgr: LayerFileManager::new(),
        }
    }

-    pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
+    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.layer_fmgr.get_from_desc(desc)
    }

@@ -51,31 +49,16 @@ impl LayerManager {
    ///
    /// We expect users only to be able to get an immutable layer map. If users want to make modifications,
    /// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
-    pub fn layer_map(&self) -> &LayerMap {
+    pub(crate) fn layer_map(&self) -> &LayerMap {
        &self.layer_map
    }

-    /// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer`
-    /// gets a refactor.
-    pub fn layer_map_mut(&mut self) -> &mut LayerMap {
-        &mut self.layer_map
-    }
-
-    /// Replace layers in the layer file manager, used in evictions and layer downloads.
-    pub fn replace_and_verify(
-        &mut self,
-        expected: Arc<dyn PersistentLayer>,
-        new: Arc<dyn PersistentLayer>,
-    ) -> Result<()> {
-        self.layer_fmgr.replace_and_verify(expected, new)
-    }
-
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk disk_consistent_lsn LSN)
-    pub fn initialize_local_layers(
+    pub(crate) fn initialize_local_layers(
        &mut self,
-        on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
+        on_disk_layers: Vec<Layer>,
        next_open_layer_at: Lsn,
    ) {
        let mut updates = self.layer_map.batch_update();
@@ -87,28 +70,13 @@ impl LayerManager {
    }

    /// Initialize when creating a new timeline, called in `init_empty_layer_map`.
-    pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
+    pub(crate) fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

-    pub fn initialize_remote_layers(
-        &mut self,
-        corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
-        remote_layers: Vec<Arc<RemoteLayer>>,
-    ) {
-        let mut updates = self.layer_map.batch_update();
-        for layer in corrupted_local_layers {
-            Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
-        }
-        for layer in remote_layers {
-            Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
-        }
-        updates.flush();
-    }
-
    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
    /// called within `get_layer_for_write`.
-    pub fn get_layer_for_write(
+    pub(crate) fn get_layer_for_write(
        &mut self,
        lsn: Lsn,
        last_record_lsn: Lsn,
@@ -163,7 +131,7 @@ impl LayerManager {
    }

    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub async fn try_freeze_in_memory_layer(
+    pub(crate) async fn try_freeze_in_memory_layer(
        &mut self,
        Lsn(last_record_lsn): Lsn,
        last_freeze_at: &AtomicLsn,
@@ -185,117 +153,101 @@ impl LayerManager {
    }

    /// Add image layers to the layer map, called from `create_image_layers`.
-    pub fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
+    pub(crate) fn track_new_image_layers(&mut self, image_layers: &[ResidentLayer]) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
-            Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
    }

    /// Flush a frozen layer and add the written delta layer to the layer map.
-    pub fn finish_flush_l0_layer(
+    pub(crate) fn finish_flush_l0_layer(
        &mut self,
-        delta_layer: Option<DeltaLayer>,
+        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
    ) {
-        let l = self.layer_map.frozen_layers.pop_front();
-        let mut updates = self.layer_map.batch_update();
+        let inmem = self
+            .layer_map
+            .frozen_layers
+            .pop_front()
+            .expect("there must be a inmem layer to flush");

-        // Only one thread may call this function at a time (for this
-        // timeline). If two threads tried to flush the same frozen
+        // Only one task may call this function at a time (for this
+        // timeline). If two tasks tried to flush the same frozen
        // layer to disk at the same time, that would not work.
-        assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
+        assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));

-        if let Some(delta_layer) = delta_layer {
-            Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
+        if let Some(l) = delta_layer {
+            let mut updates = self.layer_map.batch_update();
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            updates.flush();
        }
-        updates.flush();
    }

    /// Called when compaction is completed.
-    pub fn finish_compact_l0(
+    pub(crate) fn finish_compact_l0(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        compact_from: Vec<Arc<dyn PersistentLayer>>,
-        compact_to: Vec<Arc<dyn PersistentLayer>>,
-        metrics: &TimelineMetrics,
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        compact_from: Vec<Layer>,
+        compact_to: &[ResidentLayer],
+        duplicates: &[(ResidentLayer, ResidentLayer)],
    ) -> Result<()> {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
-            Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
        }
        for l in compact_from {
            // NB: the layer file identified by descriptor `l` is guaranteed to be present
            // in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
            // time, even though we dropped `Timeline::layers` inbetween.
-            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
-                l,
-                &mut updates,
-                metrics,
-                &mut self.layer_fmgr,
-            )?;
+            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr)?;
+        }
+        for (old, new) in duplicates {
+            self.layer_fmgr.replace(old.as_ref(), new.as_ref().clone());
        }
        updates.flush();
        Ok(())
    }

    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
-    pub fn finish_gc_timeline(
+    pub(crate) fn finish_gc_timeline(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Arc<dyn PersistentLayer>>,
-        metrics: &TimelineMetrics,
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        gc_layers: Vec<Layer>,
    ) -> Result<ApplyGcResultGuard> {
        let mut updates = self.layer_map.batch_update();
        for doomed_layer in gc_layers {
            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
+                layer_removal_cs,
                doomed_layer,
                &mut updates,
-                metrics,
                &mut self.layer_fmgr,
-            )?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
+            )?;
        }
        Ok(ApplyGcResultGuard(updates))
    }

    /// Helper function to insert a layer into the layer map and file manager.
    fn insert_historic_layer(
-        layer: Arc<dyn PersistentLayer>,
+        layer: Layer,
        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
+        mapping: &mut LayerFileManager<Layer>,
    ) {
        updates.insert_historic(layer.layer_desc().clone());
        mapping.insert(layer);
    }

-    /// Helper function to remove a layer into the layer map and file manager
-    fn remove_historic_layer(
-        layer: Arc<dyn PersistentLayer>,
-        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
-    ) {
-        updates.remove_historic(layer.layer_desc());
-        mapping.remove(layer);
-    }
-
    /// Removes the layer from local FS (if present) and from memory.
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        layer: Arc<dyn PersistentLayer>,
+        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layer: Layer,
        updates: &mut BatchedUpdates<'_>,
-        metrics: &TimelineMetrics,
-        mapping: &mut LayerFileManager,
+        mapping: &mut LayerFileManager<Layer>,
    ) -> anyhow::Result<()> {
        let desc = layer.layer_desc();
-        if !layer.is_remote_layer() {
-            layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_gauge.sub(desc.file_size);
-        }

        // TODO Removing from the bottom of the layer map is expensive.
        //      Maybe instead discard all layer map historic versions that
@@ -303,22 +255,21 @@ impl LayerManager {
        //      and mark what we can't delete yet as deleted from the layer
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
-        mapping.remove(layer);
+        mapping.remove(&layer);
+        layer.garbage_collect_on_drop();

        Ok(())
    }

-    pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
        self.layer_fmgr.contains(layer)
    }
 }

-pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
-    HashMap<PersistentLayerKey, Arc<T>>,
-);
+pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

-impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
+impl<T: AsLayerDesc + Clone + PartialEq + std::fmt::Debug> LayerFileManager<T> {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
@@ -328,14 +279,14 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
            .clone()
    }

-    pub(crate) fn insert(&mut self, layer: Arc<T>) {
+    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
            panic!("overwriting a layer: {:?}", layer.layer_desc())
        }
    }

-    pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
+    pub(crate) fn contains(&self, layer: &T) -> bool {
        self.0.contains_key(&layer.layer_desc().key())
    }

@@ -343,7 +294,7 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        Self(HashMap::new())
    }

-    pub(crate) fn remove(&mut self, layer: Arc<T>) {
+    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
            panic!(
@@ -353,38 +304,13 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        }
    }

-    pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
-        let key = expected.layer_desc().key();
-        let other = new.layer_desc().key();
+    pub(crate) fn replace(&mut self, old: &T, new: T) {
+        let key = old.layer_desc().key();
+        assert_eq!(key, new.layer_desc().key());

-        let expected_l0 = LayerMap::is_l0(expected.layer_desc());
-        let new_l0 = LayerMap::is_l0(new.layer_desc());
-
-        fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
-            "layermap-replace-notfound"
-        ));
-
-        anyhow::ensure!(
-            key == other,
-            "expected and new layer have different keys: {key:?} != {other:?}"
-        );
-
-        anyhow::ensure!(
-            expected_l0 == new_l0,
-            "one layer is l0 while the other is not: {expected_l0} != {new_l0}"
-        );
-
-        if let Some(layer) = self.0.get_mut(&key) {
-            anyhow::ensure!(
-                compare_arced_layers(&expected, layer),
-                "another layer was found instead of expected, expected={expected:?}, new={new:?}",
-                expected = Arc::as_ptr(&expected),
-                new = Arc::as_ptr(layer),
-            );
-            *layer = new;
-            Ok(())
-        } else {
-            anyhow::bail!("layer was not found");
+        if let Some(existing) = self.0.get_mut(&key) {
+            assert_eq!(existing, old);
+            *existing = new;
        }
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
-use crate::task_mgr::TaskKind;
+use crate::task_mgr::{shutdown_token, TaskKind};
 use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -211,11 +211,14 @@ async fn subscribe_for_timeline_updates(
    id: TenantTimelineId,
 ) -> Streaming<SafekeeperTimelineInfo> {
    let mut attempt = 0;
+    let cancel = shutdown_token();
+
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel,
        )
        .await;
        attempt += 1;
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,6 +1,7 @@
 use crate::metrics::RemoteOpFileKind;

 use super::storage_layer::LayerFileName;
+use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -148,17 +149,16 @@ impl UploadQueue {
            );
        }

-        let index_part_metadata = index_part.parse_metadata()?;
        info!(
            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part_metadata.disk_consistent_lsn()
+            index_part.metadata.disk_consistent_lsn()
        );

        let state = UploadQueueInitialized {
            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: index_part_metadata.clone(),
-            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
+            latest_metadata: index_part.metadata.clone(),
+            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
@@ -211,7 +211,7 @@ pub(crate) struct Delete {
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
-    UploadLayer(LayerFileName, LayerFileMetadata),
+    UploadLayer(ResidentLayer, LayerFileMetadata),

    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),
@@ -226,13 +226,8 @@ pub(crate) enum UploadOp {
 impl std::fmt::Display for UploadOp {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
-            UploadOp::UploadLayer(path, metadata) => {
-                write!(
-                    f,
-                    "UploadLayer({}, size={:?})",
-                    path.file_name(),
-                    metadata.file_size()
-                )
+            UploadOp::UploadLayer(layer, metadata) => {
+                write!(f, "UploadLayer({}, size={:?})", layer, metadata.file_size())
            }
            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
            UploadOp::Delete(delete) => write!(
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -408,9 +408,9 @@ async fn connect_to_compute_once(
    let (tx, mut rx) = tokio::sync::watch::channel(session);

    let conn_id = uuid::Uuid::new_v4();
-    let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
+    let span = info_span!(parent: None, "connection", %conn_id);
    span.in_scope(|| {
-        info!(%session, "new connection");
+        info!(%conn_info, %session, "new connection");
    });

    tokio::spawn(
@@ -420,26 +420,28 @@ async fn connect_to_compute_once(
                info!(%session, "changed session");
            }

-            let message = ready!(connection.poll_message(cx));
+            loop {
+                let message = ready!(connection.poll_message(cx));

-            match message {
-                Some(Ok(AsyncMessage::Notice(notice))) => {
-                    info!(%session, "notice: {}", notice);
-                    Poll::Pending
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session, "connection error: {}", e);
+                        return Poll::Ready(())
+                    }
+                    None => {
+                        info!("connection closed");
+                        return Poll::Ready(())
+                    }
                }
-                Some(Ok(AsyncMessage::Notification(notif))) => {
-                    warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                    Poll::Pending
-                }
-                Some(Ok(_)) => {
-                    warn!(%session, "unknown message");
-                    Poll::Pending
-                }
-                Some(Err(e)) => {
-                    error!(%session, "connection error: {}", e);
-                    Poll::Ready(())
-                }
-                None => Poll::Ready(()),
            }
        })
        .instrument(span)
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -15,8 +15,10 @@ use tokio::fs::File;
 use tokio::io::AsyncReadExt;
 use utils::http::endpoint::request_span;

+use crate::receive_wal::WalReceiverState;
 use crate::safekeeper::ServerInfo;
 use crate::safekeeper::Term;
+use crate::send_wal::WalSenderState;
 use crate::{debug_dump, pull_timeline};

 use crate::timelines_global_map::TimelineDeleteForceResult;
@@ -99,6 +101,8 @@ pub struct TimelineStatus {
    pub peer_horizon_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
+    pub walsenders: Vec<WalSenderState>,
+    pub walreceivers: Vec<WalReceiverState>,
 }

 fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
@@ -149,6 +153,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
        backup_lsn: inmem.backup_lsn,
        peer_horizon_lsn: inmem.peer_horizon_lsn,
        remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
+        walsenders: tli.get_walsenders().get_all(),
+        walreceivers: tli.get_walreceivers().get_all(),
    };
    json_response(StatusCode::OK, status)
 }
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -11,11 +11,16 @@ use crate::wal_service::ConnectionId;
 use crate::GlobalTimelines;
 use anyhow::{anyhow, Context};
 use bytes::BytesMut;
+use parking_lot::MappedMutexGuard;
+use parking_lot::Mutex;
+use parking_lot::MutexGuard;
 use postgres_backend::CopyStreamHandlerEnd;
 use postgres_backend::PostgresBackend;
 use postgres_backend::PostgresBackendReader;
 use postgres_backend::QueryError;
 use pq_proto::BeMessage;
+use serde::Deserialize;
+use serde::Serialize;
 use std::net::SocketAddr;
 use std::sync::Arc;
 use tokio::io::AsyncRead;
@@ -32,6 +37,105 @@ use tracing::*;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

+/// Registry of WalReceivers (compute connections). Timeline holds it (wrapped
+/// in Arc).
+pub struct WalReceivers {
+    mutex: Mutex<WalReceiversShared>,
+}
+
+/// Id (slot index) under which walreceiver is registered in the `WalReceivers` registry.
+type WalReceiverId = usize;
+
+impl WalReceivers {
+    pub fn new() -> Arc<WalReceivers> {
+        Arc::new(WalReceivers {
+            mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
+        })
+    }
+
+    /// Register new walreceiver. Returned guard provides access to the slot and
+    /// automatically deregisters in Drop.
+    pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
+        let slots = &mut self.mutex.lock().slots;
+        let walreceiver = WalReceiverState::Voting;
+        // find empty slot or create new one
+        let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
+            slots[pos] = Some(walreceiver);
+            pos
+        } else {
+            let pos = slots.len();
+            slots.push(Some(walreceiver));
+            pos
+        };
+        WalReceiverGuard {
+            id: pos,
+            walreceivers: self.clone(),
+        }
+    }
+
+    /// Get reference to locked slot contents. Slot must exist (registered
+    /// earlier).
+    fn get_slot<'a>(
+        self: &'a Arc<WalReceivers>,
+        id: WalReceiverId,
+    ) -> MappedMutexGuard<'a, WalReceiverState> {
+        MutexGuard::map(self.mutex.lock(), |locked| {
+            locked.slots[id]
+                .as_mut()
+                .expect("walreceiver doesn't exist")
+        })
+    }
+
+    /// Get number of walreceivers (compute connections).
+    pub fn get_num(self: &Arc<WalReceivers>) -> usize {
+        self.mutex.lock().slots.iter().flatten().count()
+    }
+
+    /// Get state of all walreceivers.
+    pub fn get_all(self: &Arc<WalReceivers>) -> Vec<WalReceiverState> {
+        self.mutex.lock().slots.iter().flatten().cloned().collect()
+    }
+
+    /// Unregister walreceiver.
+    fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
+        let mut shared = self.mutex.lock();
+        shared.slots[id] = None;
+    }
+}
+
+/// Only a few connections are expected (normally one), so store in Vec.
+struct WalReceiversShared {
+    slots: Vec<Option<WalReceiverState>>,
+}
+
+/// Walreceiver status. Currently only whether it passed voting stage and
+/// started receiving the stream, but it is easy to add more if needed.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum WalReceiverState {
+    Voting,
+    Streaming,
+}
+
+/// Scope guard to access slot in WalReceivers registry and unregister from it in
+/// Drop.
+pub struct WalReceiverGuard {
+    id: WalReceiverId,
+    walreceivers: Arc<WalReceivers>,
+}
+
+impl WalReceiverGuard {
+    /// Get reference to locked shared state contents.
+    fn get(&self) -> MappedMutexGuard<WalReceiverState> {
+        self.walreceivers.get_slot(self.id)
+    }
+}
+
+impl Drop for WalReceiverGuard {
+    fn drop(&mut self) {
+        self.walreceivers.unregister(self.id);
+    }
+}
+
 const MSG_QUEUE_SIZE: usize = 256;
 const REPLY_QUEUE_SIZE: usize = 16;

@@ -246,10 +350,13 @@ impl WalAcceptor {
    /// it must mean that network thread terminated.
    async fn run(&mut self) -> anyhow::Result<()> {
        // Register the connection and defer unregister.
-        self.tli.on_compute_connect().await?;
-        let _guard = ComputeConnectionGuard {
+        // Guard order matters: locals drop in reverse declaration order, so on exit we first remove
+        // our walreceiver entry and then update status, which depends on registered connections.
+        let _compute_conn_guard = ComputeConnectionGuard {
            timeline: Arc::clone(&self.tli),
        };
+        let walreceiver_guard = self.tli.get_walreceivers().register();
+        self.tli.update_status_notify().await?;

        // After this timestamp we will stop processing AppendRequests and send a response
        // to the walproposer. walproposer sends at least one AppendRequest per second,
@@ -263,6 +370,11 @@ impl WalAcceptor {
            }
            let mut next_msg = opt_msg.unwrap();

+            // Update walreceiver state in the registry for reporting.
+            if let ProposerAcceptorMessage::Elected(_) = &next_msg {
+                *walreceiver_guard.get() = WalReceiverState::Streaming;
+            }
+
            let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
                // loop through AppendRequest's while it's readily available to
                // write as many WAL as possible without fsyncing
@@ -311,6 +423,7 @@ impl WalAcceptor {
    }
 }

+/// Calls update_status_notify in drop to update timeline status.
 struct ComputeConnectionGuard {
    timeline: Arc<Timeline>,
 }
@@ -318,11 +431,9 @@ struct ComputeConnectionGuard {
 impl Drop for ComputeConnectionGuard {
    fn drop(&mut self) {
        let tli = self.timeline.clone();
-        // tokio forbids to call blocking_send inside the runtime, and see
-        // comments in on_compute_disconnect why we call blocking_send.
        tokio::spawn(async move {
-            if let Err(e) = tli.on_compute_disconnect().await {
-                error!("failed to unregister compute connection: {}", e);
+            if let Err(e) = tli.update_status_notify().await {
+                error!("failed to update timeline status: {}", e);
            }
        });
    }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -23,6 +23,7 @@ use utils::{
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;

+use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
    SafekeeperMemState, ServerInfo, Term,
@@ -164,8 +165,8 @@ impl SharedState {
        })
    }

-    fn is_active(&self, remote_consistent_lsn: Lsn) -> bool {
-        self.is_wal_backup_required()
+    fn is_active(&self, num_computes: usize, remote_consistent_lsn: Lsn) -> bool {
+        self.is_wal_backup_required(num_computes)
            // FIXME: add tracking of relevant pageservers and check them here individually,
            // otherwise migration won't work (we suspend too early).
            || remote_consistent_lsn < self.sk.inmem.commit_lsn
@@ -173,29 +174,34 @@ impl SharedState {

    /// Mark timeline active/inactive and return whether s3 offloading requires
    /// start/stop action.
-    fn update_status(&mut self, remote_consistent_lsn: Lsn, ttid: TenantTimelineId) -> bool {
-        let is_active = self.is_active(remote_consistent_lsn);
+    fn update_status(
+        &mut self,
+        num_computes: usize,
+        remote_consistent_lsn: Lsn,
+        ttid: TenantTimelineId,
+    ) -> bool {
+        let is_active = self.is_active(num_computes, remote_consistent_lsn);
        if self.active != is_active {
            info!("timeline {} active={} now", ttid, is_active);
        }
        self.active = is_active;
-        self.is_wal_backup_action_pending()
+        self.is_wal_backup_action_pending(num_computes)
    }

    /// Should we run s3 offloading in current state?
-    fn is_wal_backup_required(&self) -> bool {
+    fn is_wal_backup_required(&self, num_computes: usize) -> bool {
        let seg_size = self.get_wal_seg_size();
-        self.num_computes > 0 ||
+        num_computes > 0 ||
        // Currently only the whole segment is offloaded, so compare segment numbers.
-               (self.sk.inmem.commit_lsn.segment_number(seg_size) >
-                self.sk.inmem.backup_lsn.segment_number(seg_size))
+            (self.sk.inmem.commit_lsn.segment_number(seg_size) >
+             self.sk.inmem.backup_lsn.segment_number(seg_size))
    }

    /// Is current state of s3 offloading is not what it ought to be?
-    fn is_wal_backup_action_pending(&self) -> bool {
-        let res = self.wal_backup_active != self.is_wal_backup_required();
+    fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
+        let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
        if res {
-            let action_pending = if self.is_wal_backup_required() {
+            let action_pending = if self.is_wal_backup_required(num_computes) {
                "start"
            } else {
                "stop"
@@ -210,8 +216,8 @@ impl SharedState {

    /// Returns whether s3 offloading is required and sets current status as
    /// matching.
-    fn wal_backup_attend(&mut self) -> bool {
-        self.wal_backup_active = self.is_wal_backup_required();
+    fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
+        self.wal_backup_active = self.is_wal_backup_required(num_computes);
        self.wal_backup_active
    }

@@ -295,6 +301,7 @@ pub struct Timeline {
    /// while holding it, ensuring that consensus checks are in order.
    mutex: Mutex<SharedState>,
    walsenders: Arc<WalSenders>,
+    walreceivers: Arc<WalReceivers>,

    /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
    cancellation_tx: watch::Sender<bool>,
@@ -329,6 +336,7 @@ impl Timeline {
            commit_lsn_watch_rx,
            mutex: Mutex::new(shared_state),
            walsenders: WalSenders::new(rcl),
+            walreceivers: WalReceivers::new(),
            cancellation_rx,
            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
@@ -355,6 +363,7 @@ impl Timeline {
            commit_lsn_watch_rx,
            mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
            walsenders: WalSenders::new(Lsn(0)),
+            walreceivers: WalReceivers::new(),
            cancellation_rx,
            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
@@ -441,40 +450,22 @@ impl Timeline {
    }

    fn update_status(&self, shared_state: &mut SharedState) -> bool {
-        shared_state.update_status(self.get_walsenders().get_remote_consistent_lsn(), self.ttid)
+        shared_state.update_status(
+            self.walreceivers.get_num(),
+            self.get_walsenders().get_remote_consistent_lsn(),
+            self.ttid,
+        )
    }

-    /// Register compute connection, starting timeline-related activity if it is
-    /// not running yet.
-    pub async fn on_compute_connect(&self) -> Result<()> {
+    /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
+    pub async fn update_status_notify(&self) -> Result<()> {
        if self.is_cancelled() {
            bail!(TimelineError::Cancelled(self.ttid));
        }
-
-        let is_wal_backup_action_pending: bool;
-        {
+        let is_wal_backup_action_pending: bool = {
            let mut shared_state = self.write_shared_state().await;
-            shared_state.num_computes += 1;
-            is_wal_backup_action_pending = self.update_status(&mut shared_state);
-        }
-        // Wake up wal backup launcher, if offloading not started yet.
-        if is_wal_backup_action_pending {
-            // Can fail only if channel to a static thread got closed, which is not normal at all.
-            self.wal_backup_launcher_tx.send(self.ttid).await?;
-        }
-        Ok(())
-    }
-
-    /// De-register compute connection, shutting down timeline activity if
-    /// pageserver doesn't need catchup.
-    pub async fn on_compute_disconnect(&self) -> Result<()> {
-        let is_wal_backup_action_pending: bool;
-        {
-            let mut shared_state = self.write_shared_state().await;
-            shared_state.num_computes -= 1;
-            is_wal_backup_action_pending = self.update_status(&mut shared_state);
-        }
-        // Wake up wal backup launcher, if it is time to stop the offloading.
+            self.update_status(&mut shared_state)
+        };
        if is_wal_backup_action_pending {
            // Can fail only if channel to a static thread got closed, which is not normal at all.
            self.wal_backup_launcher_tx.send(self.ttid).await?;
@@ -519,7 +510,9 @@ impl Timeline {
            return false;
        }

-        self.write_shared_state().await.wal_backup_attend()
+        self.write_shared_state()
+            .await
+            .wal_backup_attend(self.walreceivers.get_num())
    }

    /// Returns commit_lsn watch channel.
@@ -650,6 +643,10 @@ impl Timeline {
        &self.walsenders
    }

+    pub fn get_walreceivers(&self) -> &Arc<WalReceivers> {
+        &self.walreceivers
+    }
+
    /// Returns flush_lsn.
    pub async fn get_flush_lsn(&self) -> Lsn {
        self.write_shared_state().await.sk.wal_store.flush_lsn()
--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -1,76 +0,0 @@
-#! /usr/bin/env python3
-# Script to generate ext_index.json metadata file
-# that stores content of the control files and location of extension archives
-# for all extensions in extensions subdir.
-import argparse
-import json
-import subprocess
-from pathlib import Path
-
-"""
-# ext_index.json example:
-{
-    "public_extensions": [
-        "anon"
-    ],
-    "library_index": {
-        "anon": "anon",
-        // for more complex extensions like postgis
-        // we might have something like:
-        // address_standardizer: postgis
-        // postgis_tiger: postgis
-    },
-    "extension_data": {
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5648391853/v15/extensions/anon.tar.zst"
-        }
-    }
-}
-"""
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="generate ext_index.json")
-    parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
-    parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
-    parser.add_argument("--public_extensions", type=str, help="list of public extensions")
-    args = parser.parse_args()
-    pg_version = args.pg_version
-    BUILD_TAG = args.BUILD_TAG
-    public_ext_list = args.public_extensions.split(",")
-
-    ext_index = {}
-    library_index = {}
-    EXT_PATH = Path("extensions")
-    for extension in EXT_PATH.iterdir():
-        if extension.is_dir():
-            control_data = {}
-            for control_file in extension.glob("*.control"):
-                if control_file.suffix != ".control":
-                    continue
-                with open(control_file, "r") as f:
-                    control_data[control_file.name] = f.read()
-            ext_index[extension.name] = {
-                "control_data": control_data,
-                "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
-            }
-        elif extension.suffix == ".zst":
-            file_list = (
-                str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
-                .strip()
-                .split("\n")
-            )
-            for file in file_list:
-                if file.endswith(".so") and file.startswith("lib/"):
-                    lib_name = file[4:-3]
-                    library_index[lib_name] = extension.name.replace(".tar.zst", "")
-
-    all_data = {
-        "public_extensions": public_ext_list,
-        "library_index": library_index,
-        "extension_data": ext_index,
-    }
-    with open("ext_index.json", "w") as f:
-        json.dump(all_data, f)
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -70,6 +70,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "pageserver_getpage_reconstruct_seconds_count",
    "pageserver_getpage_reconstruct_seconds_sum",
    *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
+    *histogram("pageserver_smgr_query_seconds_global"),
    *histogram("pageserver_read_num_fs_layers"),
    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
    *histogram("pageserver_wait_lsn_seconds"),
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1524,7 +1524,7 @@ class NeonPageserver(PgProtocol):
            ".*wait for layer upload ops to complete.*",  # .*Caused by:.*wait_completion aborted because upload queue was stopped
            ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping",  # When gc checks timeline state after acquiring layer_removal_cs
            ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant",  # Tenant::gc precondition
-            ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping",  # When compaction checks timeline state after acquiring layer_removal_cs
+            ".*compaction_loop.*Compaction failed, retrying in.*timeline or pageserver is shutting down",  # When compaction checks timeline state after acquiring layer_removal_cs
            ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
            ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
            ".*task iteration took longer than the configured period.*",
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -22,7 +22,7 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:

    # eviction might be the first one after an attach to access the layers
    env.pageserver.allowed_errors.append(
-        ".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction"
+        ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
    )
    assert isinstance(env.remote_storage, LocalFsStorage)
    return env
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):

    env.pageserver.allowed_errors.extend(
        [
-            ".*Failed to load delta layer.*",
+            ".*layer loading failed:.*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
@@ -99,7 +99,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    # Third timeline will also fail during basebackup, because the layer file is corrupt.
    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
    # (We don't check layer file contents on startup, when loading the timeline)
-    with pytest.raises(Exception, match="Failed to load delta layer") as err:
+    with pytest.raises(Exception, match="layer loading failed:") as err:
        pg3.start()
    log.info(
        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -2,35 +2,137 @@ import time

 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
+from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from requests.exceptions import ConnectionError


-# Test duplicate layer detection
-#
-# This test sets fail point at the end of first compaction phase:
-# after flushing new L1 layers but before deletion of L0 layers
-# it should cause generation of duplicate L1 layer by compaction after restart.
@pytest.mark.timeout(600)
-def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
-    pageserver_http = env.pageserver.http_client()
+def test_compaction_duplicates_all(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Makes compact_level0_phase1 return input layers as the output layers with a
+    failpoint, as if those L0 inputs had all been recreated when L1s were
+    supposed to be created.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_compaction_duplicates_all",
+    )

-    # Use aggressive compaction and checkpoint settings
-    tenant_id, _ = env.neon_cli.create_tenant(
-        conf={
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
            "checkpoint_distance": f"{1024 ** 2}",
            "compaction_target_size": f"{1024 ** 2}",
-            "compaction_period": "5 s",
+            "compaction_period": "0 s",
            "compaction_threshold": "3",
        }
    )
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline

    pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return"))
+    # pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit"))

    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    connstr = endpoint.connstr(options="-csynchronous_commit=off")
    pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])

-    time.sleep(10)  # let compaction to be performed
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
    assert env.pageserver.log_contains("compact-level0-phase1-return-same")

-    pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr])
+
+def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    This test sets a failpoint at the end of the first compaction phase:
+    after flushing new L1 layers but before deletion of L0 layers.
+    It should cause generation of a duplicate L1 layer by compaction after restart.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
+        test_name="test_duplicate_layers",
+    )
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": f"{1024 ** 2}",
+            "compaction_target_size": f"{1024 ** 2}",
+            "compaction_period": "0 s",
+            "compaction_threshold": "3",
+        }
+    )
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
+
+    pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit"))
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    connstr = endpoint.connstr(options="-csynchronous_commit=off")
+    pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
+
+    with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
+
+    # pageserver has already exited at this point
+    env.pageserver.stop()
+
+    # the L1 that is expected to be duplicated now exists locally, but is not yet uploaded
+    assert isinstance(env.remote_storage, LocalFsStorage)
+
+    # path = env.remote_storage.timeline_path(tenant_id, timeline_id)
+    l1_found = None
+    for path in env.timeline_dir(tenant_id, timeline_id).iterdir():
+        if path.name == "metadata" or path.name.startswith("ephemeral-"):
+            continue
+
+        if len(path.suffixes) > 0:
+            # temp files
+            continue
+
+        [key_range, lsn_range] = path.name.split("__", maxsplit=1)
+
+        if "-" not in lsn_range:
+            # single LSN instead of an LSN range: image layer
+            continue
+
+        [key_start, key_end] = key_range.split("-", maxsplit=1)
+
+        if key_start == "0" * 36 and key_end == "F" * 36:
+            # covers the whole key range: L0
+            continue
+
+        assert l1_found is None, f"found multiple L1: {l1_found.name} and {path.name}"
+        l1_found = path
+
+    assert l1_found is not None, "failed to find L1 locally"
+    original_created_at = l1_found.stat()[8]  # tuple index 8 is st_mtime, in whole seconds
+
+    uploaded = env.remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name
+    assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
+
+    # whole-second timestamps only differ if at least a second passes between writes
+    time.sleep(1)
+
+    env.pageserver.start()
+    warning = f".*duplicated L1 layer layer={l1_found.name}"
+    env.pageserver.allowed_errors.append(warning)
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    # give time for log flush
+    time.sleep(1)
+
+    assert env.pageserver.log_contains(warning), "expected the duplicated L1 warning to be logged"
+
+    overwritten_at = l1_found.stat()[8]
+    assert original_created_at < overwritten_at, "expected the L1 to be overwritten"
+
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
+
+    uploaded_at = uploaded.stat()[8]
+    assert overwritten_at <= uploaded_at, "expected the L1 to finally be uploaded"
+
+    # why does compaction not wait for uploads? probably so that we can compact
+    # faster than we can upload in some cases.
+    #
+    # timeline_compact should wait for uploads as well
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -256,34 +256,34 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
    ps_http.evict_all_layers(tenant_id, timeline_id)

    def ensure_resident_and_remote_size_metrics():
-        log.info("ensure that all the layers are gone")
        resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
        # we have disabled all background loops, so, this should hold
-        assert len(resident_layers) == 0
+        assert len(resident_layers) == 0, "ensure that all the layers are gone"

        info = ps_http.layer_map_info(tenant_id, timeline_id)
        log.info("layer map dump: %s", info)

-        log.info("ensure that resident_physical_size metric is zero")
        resident_physical_size_metric = ps_http.get_timeline_metric(
            tenant_id, timeline_id, "pageserver_resident_physical_size"
        )
-        assert resident_physical_size_metric == 0
-        log.info("ensure that resident_physical_size metric corresponds to layer map dump")
+        assert (
+            resident_physical_size_metric == 0
+        ), "ensure that resident_physical_size metric is zero"
        assert resident_physical_size_metric == sum(
-            [layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
-        )
+            layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
+        ), "ensure that resident_physical_size metric corresponds to layer map dump"

-        log.info("ensure that remote_physical_size metric matches layer map")
        remote_physical_size_metric = ps_http.get_timeline_metric(
            tenant_id, timeline_id, "pageserver_remote_physical_size"
        )
-        log.info("ensure that remote_physical_size metric corresponds to layer map dump")
        assert remote_physical_size_metric == sum(
            layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
-        )
+        ), "ensure that remote_physical_size metric corresponds to layer map dump"

    log.info("before runnning GC, ensure that remote_physical size is zero")
+    # leaving index_part.json upload from successful compaction out will show
+    # up here as a mismatch between remote_physical_size and summed up layermap
+    # size
    ensure_resident_and_remote_size_metrics()

    log.info("run GC")
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -13,13 +13,12 @@ from fixtures.neon_fixtures import (
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
    assert_tenant_state,
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_for_upload_queue_empty,
-    wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
 from fixtures.types import Lsn
@@ -369,7 +368,7 @@ def test_download_remote_layers_api(
    filled_current_physical = get_api_current_physical_size()
    log.info(filled_current_physical)
    filled_size = get_resident_physical_size()
-    log.info(filled_size)
+    log.info(f"filled_size: {filled_size}")
    assert filled_current_physical == filled_size, "we don't yet do layer eviction"

    env.pageserver.stop()
@@ -377,7 +376,7 @@ def test_download_remote_layers_api(
    # remove all the layer files
    # XXX only delete some of the layer files, to show that it really just downloads all the layers
    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
-        log.info(f"unlinking layer {layer}")
+        log.info(f"unlinking layer {layer.name}")
        layer.unlink()

    # Shut down safekeepers before starting the pageserver.
@@ -391,7 +390,7 @@ def test_download_remote_layers_api(
    env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
    env.pageserver.allowed_errors.extend(
        [
-            f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint",
+            ".*download failed: downloading evicted layer file failed.*",
            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
        ]
    )
@@ -403,7 +402,7 @@ def test_download_remote_layers_api(
        filled_current_physical == get_api_current_physical_size()
    ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
    post_unlink_size = get_resident_physical_size()
-    log.info(post_unlink_size)
+    log.info(f"post_unlink_size: {post_unlink_size}")
    assert (
        post_unlink_size < filled_size
    ), "we just deleted layers and didn't cause anything to re-download them yet"
@@ -658,62 +657,5 @@ def test_compaction_downloads_on_demand_with_image_creation(
    assert dict(kinds_after) == {"Delta": 4, "Image": 1}


-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
-def test_ondemand_download_failure_to_replace(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    """
-    Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
-
-    See: https://github.com/neondatabase/neon/issues/3533
-    """
-
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_ondemand_download_failure_to_replace",
-    )
-
-    # disable gc and compaction via default tenant config because config is lost while detaching
-    # so that compaction will not be the one to download the layer but the http handler is
-    neon_env_builder.pageserver_config_override = (
-        """tenant_config={gc_period = "0s", compaction_period = "0s"}"""
-    )
-
-    env = neon_env_builder.init_start()
-
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
-    pageserver_http = env.pageserver.http_client()
-
-    # remove layers so that they will be redownloaded
-    pageserver_http.tenant_detach(tenant_id)
-    pageserver_http.tenant_attach(tenant_id)
-
-    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
-    pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
-
-    # requesting details with non-incremental size should trigger a download of the only layer
-    # this will need to be adjusted if an index for logical sizes is ever implemented
-    with pytest.raises(PageserverApiException):
-        # PageserverApiException is expected because of the failpoint (timeline_detail building does something)
-        # ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping
-        # but should it be added back, we would wait for 15s here.
-        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
-
-    actual_message = ".* ERROR .*layermap-replace-notfound"
-    assert env.pageserver.log_contains(actual_message) is not None
-    env.pageserver.allowed_errors.append(actual_message)
-
-    env.pageserver.allowed_errors.append(
-        ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
-    )
-    # this might get to run and attempt on-demand, but not always
-    env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
-
-    # if the above returned, then we didn't have a livelock, and all is well
-
-
 def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
    return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -603,7 +603,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
    log.info("sending delete request")
    checkpoint_allowed_to_fail.set()
    env.pageserver.allowed_errors.append(
-        ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
+        ".* ERROR .*Error processing HTTP request: InternalServerError\\(The timeline or pageserver is shutting down"
    )

    # Generous timeout, because currently deletions can get blocked waiting for compaction
@@ -861,10 +861,8 @@ def test_compaction_delete_before_upload(
    # Ensure that this actually terminates
    wait_upload_queue_empty(client, tenant_id, timeline_id)

-    # For now we are hitting this message.
-    # Maybe in the future the underlying race condition will be fixed,
-    # but until then, ensure that this message is hit instead.
-    assert env.pageserver.log_contains(
+    # fixed in #4938
+    assert not env.pageserver.log_contains(
        "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
    )

--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -688,7 +688,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
    # temporarily detached produces these errors in the pageserver log.
    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Broken.*"
+        f".*Tenant {tenant_id} will not become active\\. Current state: (Stopping|Broken).*"
    )

    # ignore the tenant and remove its metadata
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -239,9 +239,7 @@ def test_tenant_redownloads_truncated_file_on_startup(

    assert isinstance(env.remote_storage, LocalFsStorage)

-    env.pageserver.allowed_errors.append(
-        ".*removing local file .* because it has unexpected length.*"
-    )
+    env.pageserver.allowed_errors.append(".*removing local file .* because .*")

    # FIXME: Are these expected?
    env.pageserver.allowed_errors.append(
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -14,11 +14,13 @@ publish = false
 ### BEGIN HAKARI SECTION
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
+axum = { version = "0.6", features = ["ws"] }
 bytes = { version = "1", features = ["serde"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+digest = { version = "0.10", features = ["mac", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures = { version = "0.3" }
@@ -27,6 +29,7 @@ futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
+hyper = { version = "0.14", features = ["full"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -45,6 +48,7 @@ rustls = { version = "0.20", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
+smallvec = { version = "1", default-features = false, features = ["write"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.23" }
@@ -54,7 +58,6 @@ toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
-tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
 url = { version = "2", features = ["serde"] }

 [build-dependencies]