alphabetize

allow v16
2026-02-03 02:30:37 +00:00 · 2023-08-23 13:24:42 -04:00 · 2023-08-23 13:23:51 -04:00
53 changed files with 1008 additions and 3371 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -145,11 +145,7 @@ runs:

        if [ "${RERUN_FLAKY}" == "true" ]; then
          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
-                                              --days 7 \
-                                              --output "$TEST_OUTPUT/flaky.json" \
-                                              --pg-version "${DEFAULT_PG_VERSION}" \
-                                              --build-type "${BUILD_TYPE}"
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"

          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -737,6 +737,34 @@ jobs:
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --cleanup

+      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
+      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
+      # so we won't build the extensions twice, but extract them from compute-node.
+      #
+      # For now we use the extensions image only for new custom extensions
+      - name: Kaniko build extensions only
+        run: |
+          # Kaniko is supposed to clean up after itself if the --cleanup flag is set, but it doesn't.
+          # Although some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
+          # it still fails with error:
+          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
+          #
+          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
+          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
+
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+                           --context . \
+                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
+                           --build-arg PG_VERSION=${{ matrix.version }} \
+                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
+                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
+                           --dockerfile Dockerfile.compute-node \
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+                           --cleanup \
+                           --target postgres-extensions
+
      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
@@ -752,7 +780,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.5
+      VM_BUILDER_VERSION: v0.16.3

    steps:
      - name: Checkout
@@ -775,7 +803,7 @@ jobs:
        run: |
          ./vm-builder \
            -enable-file-cache \
-            -cgroup-uid=postgres \
+            -enable-monitor \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -858,8 +886,10 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -870,8 +900,10 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -893,8 +925,10 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
@@ -904,7 +938,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ tag ]
+    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -939,10 +973,57 @@ jobs:
              }
            }"

+  upload-postgres-extensions-to-s3:
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+       github.event_name != 'workflow_dispatch'
+    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
+    needs: [ tag, promote-images ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
+
+    env:
+      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ github.ref_name == 'release' && 'latest' || needs.tag.outputs.build-tag }}
+      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
+      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
+
+    steps:
+      - name: Pull postgres-extensions image
+        run: |
+          docker pull ${EXTENSIONS_IMAGE}
+
+      - name: Create postgres-extensions container
+        id: create-container
+        run: |
+          EID=$(docker create ${EXTENSIONS_IMAGE} true)
+          echo "EID=${EID}" >> $GITHUB_OUTPUT
+
+      - name: Extract postgres-extensions from container
+        run: |
+          rm -rf ./extensions-to-upload # Just in case
+          mkdir -p extensions-to-upload
+
+          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
+          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
+
+      - name: Upload postgres-extensions to S3
+        run: |
+          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
+            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
+          done
+
+      - name: Cleanup
+        if: ${{ always() && steps.create-container.outputs.EID }}
+        run: |
+          docker rm ${{ steps.create-container.outputs.EID }} || true
+
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ promote-images, tag, regress-tests ]
+    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
--- a/13
+++ b/13
@@ -1,12 +1,11 @@
-/compute_tools/ @neondatabase/control-plane @neondatabase/compute
+/compute_tools/ @neondatabase/control-plane
 /control_plane/ @neondatabase/compute @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute
-/libs/remote_storage/ @neondatabase/storage
-/libs/safekeeper_api/ @neondatabase/safekeepers
-/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/libs/postgres_ffi/ @neondatabase/compute 
+/libs/remote_storage/ @neondatabase/storage 
+/libs/safekeeper_api/ @neondatabase/safekeepers  
+/pageserver/ @neondatabase/compute @neondatabase/storage 
 /pgxn/ @neondatabase/compute
-/proxy/ @neondatabase/proxy
+/proxy/ @neondatabase/control-plane 
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -190,7 +190,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -201,7 +201,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -553,13 +553,12 @@ dependencies = [

 [[package]]
 name = "axum"
-version = "0.6.20"
+version = "0.6.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
+checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39"
 dependencies = [
 "async-trait",
 "axum-core",
- "base64 0.21.1",
 "bitflags",
 "bytes",
 "futures-util",
@@ -574,13 +573,7 @@ dependencies = [
 "pin-project-lite",
 "rustversion",
 "serde",
- "serde_json",
- "serde_path_to_error",
- "serde_urlencoded",
- "sha1",
 "sync_wrapper",
- "tokio",
- "tokio-tungstenite 0.20.0",
 "tower",
 "tower-layer",
 "tower-service",
@@ -680,7 +673,7 @@ dependencies = [
 "regex",
 "rustc-hash",
 "shlex",
- "syn 2.0.28",
+ "syn 2.0.16",
 "which",
 ]

@@ -772,19 +765,6 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

-[[package]]
-name = "cgroups-rs"
-version = "0.3.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb3af90c8d48ad5f432d8afb521b5b40c2a2fce46dd60e05912de51c47fba64"
-dependencies = [
- "libc",
- "log",
- "nix 0.25.1",
- "regex",
- "thiserror",
-]
-
 [[package]]
 name = "chrono"
 version = "0.4.24"
@@ -869,7 +849,7 @@ dependencies = [
 "heck",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -927,7 +907,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-compression",
- "cfg-if",
 "chrono",
 "clap",
 "compute_api",
@@ -946,7 +925,6 @@ dependencies = [
 "tar",
 "tokio",
 "tokio-postgres",
- "tokio-util",
 "toml_edit",
 "tracing",
 "tracing-opentelemetry",
@@ -954,7 +932,6 @@ dependencies = [
 "tracing-utils",
 "url",
 "utils",
- "vm_monitor",
 "workspace_hack",
 "zstd",
 ]
@@ -1001,7 +978,7 @@ dependencies = [
 "comfy-table",
 "compute_api",
 "git-version",
- "nix 0.26.2",
+ "nix",
 "once_cell",
 "pageserver_api",
 "postgres",
@@ -1207,7 +1184,7 @@ dependencies = [
 "proc-macro2",
 "quote",
 "strsim",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -1218,7 +1195,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
 "darling_core",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -1283,7 +1260,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -1339,7 +1316,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -1535,7 +1512,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -1886,8 +1863,8 @@ dependencies = [
 "hyper",
 "pin-project",
 "tokio",
- "tokio-tungstenite 0.18.0",
- "tungstenite 0.18.0",
+ "tokio-tungstenite",
+ "tungstenite",
 ]

 [[package]]
@@ -1951,19 +1928,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "inotify"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
-dependencies = [
- "bitflags",
- "futures-core",
- "inotify-sys",
- "libc",
- "tokio",
-]
-
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -2287,18 +2251,6 @@ dependencies = [
 "tempfile",
 ]

-[[package]]
-name = "nix"
-version = "0.25.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4"
-dependencies = [
- "autocfg",
- "bitflags",
- "cfg-if",
- "libc",
-]
-
 [[package]]
 name = "nix"
 version = "0.26.2"
@@ -2333,7 +2285,7 @@ dependencies = [
 "crossbeam-channel",
 "filetime",
 "fsevent-sys",
- "inotify 0.9.6",
+ "inotify",
 "kqueue",
 "libc",
 "mio",
@@ -2341,15 +2293,6 @@ dependencies = [
 "windows-sys 0.45.0",
 ]

-[[package]]
-name = "ntapi"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
-dependencies = [
- "winapi",
-]
-
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2443,7 +2386,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -2630,7 +2573,7 @@ dependencies = [
 "hyper",
 "itertools",
 "metrics",
- "nix 0.26.2",
+ "nix",
 "num-traits",
 "num_cpus",
 "once_cell",
@@ -2653,7 +2596,6 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
- "smallvec",
 "storage_broker",
 "strum",
 "strum_macros",
@@ -2831,7 +2773,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -3028,7 +2970,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
 "proc-macro2",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -3039,9 +2981,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.66"
+version = "1.0.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
 dependencies = [
 "unicode-ident",
 ]
@@ -3203,9 +3145,9 @@ dependencies = [

 [[package]]
 name = "quote"
-version = "1.0.32"
+version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
 dependencies = [
 "proc-macro2",
 ]
@@ -3856,22 +3798,22 @@ dependencies = [

 [[package]]
 name = "serde"
-version = "1.0.183"
+version = "1.0.163"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
+checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
 dependencies = [
 "serde_derive",
 ]

 [[package]]
 name = "serde_derive"
-version = "1.0.183"
+version = "1.0.163"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
+checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -3885,16 +3827,6 @@ dependencies = [
 "serde",
 ]

-[[package]]
-name = "serde_path_to_error"
-version = "0.1.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4beec8bce849d58d06238cb50db2e1c417cfeafa4c63f692b15c82b7c80f8335"
-dependencies = [
- "itoa",
- "serde",
-]
-
 [[package]]
 name = "serde_spanned"
 version = "0.6.2"
@@ -3941,7 +3873,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -4040,9 +3972,9 @@ dependencies = [

 [[package]]
 name = "smallvec"
-version = "1.11.0"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
+checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"

 [[package]]
 name = "socket2"
@@ -4179,9 +4111,9 @@ dependencies = [

 [[package]]
 name = "syn"
-version = "2.0.28"
+version = "2.0.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
+checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -4206,21 +4138,6 @@ dependencies = [
 "unicode-xid",
 ]

-[[package]]
-name = "sysinfo"
-version = "0.29.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "165d6d8539689e3d3bc8b98ac59541e1f21c7de7c85d60dc80e43ae0ed2113db"
-dependencies = [
- "cfg-if",
- "core-foundation-sys",
- "libc",
- "ntapi",
- "once_cell",
- "rayon",
- "winapi",
-]
-
 [[package]]
 name = "tar"
 version = "0.4.40"
@@ -4311,7 +4228,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -4426,7 +4343,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -4532,19 +4449,7 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite 0.18.0",
-]
-
-[[package]]
-name = "tokio-tungstenite"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b2dbec703c26b00d74844519606ef15d09a7d6857860f84ad223dec002ddea2"
-dependencies = [
- "futures-util",
- "log",
- "tokio",
- "tungstenite 0.20.0",
+ "tungstenite",
 ]

 [[package]]
@@ -4736,7 +4641,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 ]

 [[package]]
@@ -4865,25 +4770,6 @@ dependencies = [
 "utf-8",
 ]

-[[package]]
-name = "tungstenite"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
-dependencies = [
- "byteorder",
- "bytes",
- "data-encoding",
- "http",
- "httparse",
- "log",
- "rand",
- "sha1",
- "thiserror",
- "url",
- "utf-8",
-]
-
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -5011,7 +4897,7 @@ dependencies = [
 "hyper",
 "jsonwebtoken",
 "metrics",
- "nix 0.26.2",
+ "nix",
 "once_cell",
 "pin-project-lite",
 "pq_proto",
@@ -5029,7 +4915,6 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
- "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -5066,28 +4951,6 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

-[[package]]
-name = "vm_monitor"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "axum",
- "cgroups-rs",
- "clap",
- "futures",
- "inotify 0.10.2",
- "serde",
- "serde_json",
- "sysinfo",
- "tokio",
- "tokio-postgres",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-subscriber",
- "workspace_hack",
-]
-
 [[package]]
 name = "vsimd"
 version = "0.8.0"
@@ -5158,7 +5021,7 @@ dependencies = [
 "once_cell",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 "wasm-bindgen-shared",
 ]

@@ -5192,7 +5055,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.16",
 "wasm-bindgen-backend",
 "wasm-bindgen-shared",
 ]
@@ -5477,14 +5340,12 @@ name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "axum",
 "bytes",
 "cc",
 "chrono",
 "clap",
 "clap_builder",
 "crossbeam-utils",
- "digest",
 "either",
 "fail",
 "futures",
@@ -5493,7 +5354,6 @@ dependencies = [
 "futures-executor",
 "futures-sink",
 "futures-util",
- "hyper",
 "itertools",
 "libc",
 "log",
@@ -5512,10 +5372,9 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
- "smallvec",
 "socket2 0.4.9",
 "syn 1.0.109",
- "syn 2.0.28",
+ "syn 2.0.16",
 "tokio",
 "tokio-rustls 0.23.4",
 "tokio-util",
@@ -5524,6 +5383,7 @@ dependencies = [
 "tower",
 "tracing",
 "tracing-core",
+ "tracing-subscriber",
 "url",
 ]

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,6 @@ members = [
    "libs/remote_storage",
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
-    "libs/vm_monitor",
 ]

 [workspace.package]
@@ -42,14 +41,12 @@ aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
 aws-credential-types = "0.55"
 aws-types = "0.55"
-axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
-cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
 close_fds = "0.3.2"
@@ -77,7 +74,6 @@ humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.9"
-inotify = "0.10.2"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -109,14 +105,12 @@ rustls = "0.20"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
-sysinfo = "0.29.2"
 sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
-smallvec = "1.11"
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -139,7 +133,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
@@ -175,7 +169,6 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
-vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -50,8 +50,6 @@ RUN cd postgres && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control

-RUN for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql ; do echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO PUBLIC;' >> $file ; done
-
 #########################################################################################
 #
 # Layer "postgis-build"
@@ -213,8 +211,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
-    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
+    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -766,6 +764,29 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

+#########################################################################################
+#
+# Extensions only
+#
+#########################################################################################
+FROM python:3.9-slim-bullseye AS generate-ext-index
+ARG PG_VERSION
+ARG BUILD_TAG
+RUN apt update && apt install -y zstd
+
+# copy the control files here
+COPY --from=kq-imcx-pg-build /extensions/ /extensions/
+COPY --from=pg-anon-pg-build /extensions/ /extensions/
+COPY --from=postgis-build /extensions/ /extensions/
+COPY scripts/combine_control_files.py ./combine_control_files.py
+RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
+
+FROM scratch AS postgres-extensions
+# After the transition this layer will include all extensions.
+# For now, it only contains a couple of them for testing purposes.
+COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
+COPY --from=generate-ext-index /ext_index.json /ext_index.json
+
 #########################################################################################
 #
 # Final layer
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 chrono.workspace = true
-cfg-if.workspace = true
 clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
@@ -24,7 +23,6 @@ tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
-tokio-util.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -36,5 +34,4 @@ utils.workspace = true
 workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
-vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -19,10 +19,9 @@ Also `compute_ctl` spawns two separate service threads:
 - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
  last activity requests.

-If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
-`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
-`vm-monitor` communicates with the VM autoscaling system. It coordinates
-downscaling and requests immediate upscaling under resource pressure.
+If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
+compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
+downscaling and (eventually) will request immediate upscaling under resource pressure.

 Usage example:
 ```sh
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -20,10 +20,9 @@
 //! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
 //!   last activity requests.
 //!
-//! If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
-//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
-//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
-//! downscaling and requests immediate upscaling under resource pressure.
+//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
+//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
+//! downscaling and (eventually) will request immediate upscaling under resource pressure.
 //!
 //! Usage example:
 //! ```sh
@@ -36,6 +35,7 @@
 //!
 use std::collections::HashMap;
 use std::fs::File;
+use std::panic;
 use std::path::Path;
 use std::process::exit;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
@@ -271,55 +271,6 @@ fn main() -> Result<()> {
        }
    };

-    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
-    // because it requires cgroups.
-    cfg_if::cfg_if! {
-        if #[cfg(target_os = "linux")] {
-            use std::env;
-            use tokio_util::sync::CancellationToken;
-            use tracing::warn;
-            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
-            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
-            let cgroup = matches.get_one::<String>("cgroup");
-
-            // Only make a runtime if we need to.
-            // Note: it seems like you can make a runtime in an inner scope and
-            // if you start a task in it it won't be dropped. However, make it
-            // in the outermost scope just to be safe.
-            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
-                (None, None) => None,
-                (None, Some(_)) => {
-                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
-                    None
-                }
-                (Some(_), None) => {
-                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
-                }
-                (Some(_), Some(_)) => Some(
-                    tokio::runtime::Builder::new_multi_thread()
-                        .worker_threads(4)
-                        .enable_all()
-                        .build()
-                        .expect("failed to create tokio runtime for monitor"),
-                ),
-            };
-
-            // This token is used internally by the monitor to clean up all threads
-            let token = CancellationToken::new();
-
-            let vm_monitor = &rt.as_ref().map(|rt| {
-                rt.spawn(vm_monitor::start(
-                    Box::leak(Box::new(vm_monitor::Args {
-                        cgroup: cgroup.cloned(),
-                        pgconnstr: file_cache_connstr.cloned(),
-                        addr: vm_monitor_addr.cloned().unwrap(),
-                    })),
-                    token.clone(),
-                ))
-            });
-        }
-    }
-
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    if let Some(mut pg) = pg {
@@ -333,24 +284,6 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }

-    // Terminate the vm_monitor so it releases the file watcher on
-    // /sys/fs/cgroup/neon-postgres.
-    // Note: the vm-monitor only runs on linux because it requires cgroups.
-    cfg_if::cfg_if! {
-        if #[cfg(target_os = "linux")] {
-            if let Some(handle) = vm_monitor {
-                // Kills all threads spawned by the monitor
-                token.cancel();
-                // Kills the actual task running the monitor
-                handle.abort();
-
-                // If handle is some, rt must have been used to produce it, and
-                // hence is also some
-                rt.unwrap().shutdown_timeout(Duration::from_secs(2));
-            }
-        }
-    }
-
    // Maybe sync safekeepers again, to speed up next startup
    let compute_state = compute.state.lock().unwrap().clone();
    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -460,29 +393,6 @@ fn cli() -> clap::Command {
                .long("remote-ext-config")
                .value_name("REMOTE_EXT_CONFIG"),
        )
-        // TODO(fprasx): we currently have default arguments because the cloud PR
-        // to pass them in hasn't been merged yet. We should get rid of them once
-        // the PR is merged.
-        .arg(
-            Arg::new("vm-monitor-addr")
-                .long("vm-monitor-addr")
-                .default_value("0.0.0.0:10301")
-                .value_name("VM_MONITOR_ADDR"),
-        )
-        .arg(
-            Arg::new("cgroup")
-                .long("cgroup")
-                .default_value("neon-postgres")
-                .value_name("CGROUP"),
-        )
-        .arg(
-            Arg::new("filecache-connstr")
-                .long("filecache-connstr")
-                .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
-                )
-                .value_name("FILECACHE_CONNSTR"),
-        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,5 +1,4 @@
 use std::collections::HashMap;
-use std::env;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
@@ -176,27 +175,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

-/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
-/// cgroup. Otherwise returns the default `Command::new(cmd)`
-///
-/// This function should be used to start postgres, as it will start it in the
-/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
-/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
-/// creates it during the sysinit phase of its inittab.
-fn maybe_cgexec(cmd: &str) -> Command {
-    // The cplane sets this env var for autoscaling computes.
-    // use `var_os` so we don't have to worry about the variable being valid
-    // unicode. Should never be an concern . . . but just in case
-    if env::var_os("AUTOSCALING").is_some() {
-        let mut command = Command::new("cgexec");
-        command.args(["-g", "memory:neon-postgres"]);
-        command.arg(cmd);
-        command
-    } else {
-        Command::new(cmd)
-    }
-}
-
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -473,7 +451,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let sync_handle = maybe_cgexec(&self.pgbin)
+        let sync_handle = Command::new(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -608,7 +586,7 @@ impl ComputeNode {

        // Start postgres
        info!("starting postgres");
-        let mut pg = maybe_cgexec(&self.pgbin)
+        let mut pg = Command::new(&self.pgbin)
            .args(["-D", pgdata])
            .spawn()
            .expect("cannot start postgres process");
@@ -636,7 +614,7 @@ impl ComputeNode {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
-        let mut pg = maybe_cgexec(&self.pgbin)
+        let mut pg = Command::new(&self.pgbin)
            .args(["-D", &self.pgdata])
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -108,10 +108,12 @@ pub fn get_pg_version(pgbin: &str) -> String {
    // pg_config --version returns a (platform specific) human readable string
    // such as "PostgreSQL 15.4". We parse this to v14/v15
    let human_version = get_pg_config("--version", pgbin);
-    if human_version.contains("15") {
-        return "v15".to_string();
-    } else if human_version.contains("14") {
+    if human_version.contains("14") {
        return "v14".to_string();
+    } else if human_version.contains("15") {
+        return "v15".to_string();
+    } else if human_version.contains("16") {
+        return "v16".to_string();
    }
    panic!("Unsuported postgres version {human_version}");
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,7 +26,6 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -1,31 +1,18 @@
 use std::fmt::{Debug, Display};

 use futures::Future;
-use tokio_util::sync::CancellationToken;

 pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
 pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;

-pub async fn exponential_backoff(
-    n: u32,
-    base_increment: f64,
-    max_seconds: f64,
-    cancel: &CancellationToken,
-) {
+pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
    let backoff_duration_seconds =
        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
    if backoff_duration_seconds > 0.0 {
        tracing::info!(
            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
        );
-
-        drop(
-            tokio::time::timeout(
-                std::time::Duration::from_secs_f64(backoff_duration_seconds),
-                cancel.cancelled(),
-            )
-            .await,
-        )
+        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
    }
 }

@@ -37,57 +24,28 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
    }
 }

-/// Configure cancellation for a retried operation: when to cancel (the token), and
-/// what kind of error to return on cancellation
-pub struct Cancel<E, CF>
-where
-    E: Display + Debug + 'static,
-    CF: Fn() -> E,
-{
-    token: CancellationToken,
-    on_cancel: CF,
-}
-
-impl<E, CF> Cancel<E, CF>
-where
-    E: Display + Debug + 'static,
-    CF: Fn() -> E,
-{
-    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
-        Self { token, on_cancel }
-    }
-}
-
 /// retries passed operation until one of the following conditions are met:
 /// Encountered error is considered as permanent (non-retryable)
 /// Retries have been exhausted.
 /// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
 /// When attempts cross `warn_threshold` function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
-/// to drop out promptly on shutdown.
-pub async fn retry<T, O, F, E, CF>(
+pub async fn retry<T, O, F, E>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
-    cancel: Cancel<E, CF>,
 ) -> Result<T, E>
 where
    // Not std::error::Error because anyhow::Error doesnt implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
-    E: Display + Debug + 'static,
+    E: Display + Debug,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
-    CF: Fn() -> E,
 {
    let mut attempts = 0;
    loop {
-        if cancel.token.is_cancelled() {
-            return Err((cancel.on_cancel)());
-        }
-
        let result = op().await;
        match result {
            Ok(_) => {
@@ -122,7 +80,6 @@ where
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            &cancel.token,
        )
        .await;
        attempts += 1;
@@ -175,7 +132,6 @@ mod tests {
            1,
            1,
            "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await;

@@ -201,7 +157,6 @@ mod tests {
            2,
            2,
            "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap();
@@ -224,7 +179,6 @@ mod tests {
            2,
            2,
            "work",
-            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap_err();
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -1,31 +0,0 @@
-[package]
-name = "vm_monitor"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[[bin]]
-name = "vm-monitor"
-path = "./src/bin/monitor.rs"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-anyhow.workspace = true
-axum.workspace = true
-clap.workspace = true
-futures.workspace = true
-inotify.workspace = true
-serde.workspace = true
-serde_json.workspace = true
-sysinfo.workspace = true
-tokio.workspace = true
-tokio-postgres.workspace = true
-tokio-stream.workspace = true
-tokio-util.workspace = true
-tracing.workspace = true
-tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
-
-[target.'cfg(target_os = "linux")'.dependencies]
-cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -1,34 +0,0 @@
-# `vm-monitor`
-
-The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
-along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
-two primary roles: 1) notifying agents when immediate upscaling is necessary due
-to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
-out upscaling and downscaling decisions.
-
-## More on scaling
-
-We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
-To control thresholds for receiving memory usage notifications, we start Postgres
-in the `neon-postgres` cgroup and set its `memory.{max,high}`.
-
-* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
-* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
-where initial development of the monitor happened. The repository is no longer
-maintained but the commit history may be useful for debugging.
-
-## Structure
-
-The `vm-monitor` is loosely comprised of a few systems. These are:
-* the server: this is just a simple `axum` server that accepts requests and
-upgrades them to websocket connections. The server only allows one connection at
-a time. This means that upon receiving a new connection, the server will terminate
-and old one if it exists.
-* the filecache: a struct that allows communication with the Postgres file cache.
-On startup, we connect to the filecache and hold on to the connection for the
-entire monitor lifetime.
-* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
-listening for `memory.high` events and setting its `memory.{high,max}` values.
-* the runner: the runner marries the filecache and cgroup watcher together,
-communicating with the agent throught the `Dispatcher`, and then calling filecache
-and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/bin/monitor.rs
+++ b/libs/vm_monitor/src/bin/monitor.rs
@@ -1,33 +0,0 @@
-// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
-// we can test the monitor as part of the entire autoscaling system in
-// neondatabase/autoscaling.
-//
-// The monitor was previously started by vm-builder, and for testing purposes,
-// we can mimic that setup with this binary.
-
-#[cfg(target_os = "linux")]
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    use clap::Parser;
-    use tokio_util::sync::CancellationToken;
-    use tracing_subscriber::EnvFilter;
-    use vm_monitor::Args;
-
-    let subscriber = tracing_subscriber::fmt::Subscriber::builder()
-        .json()
-        .with_file(true)
-        .with_line_number(true)
-        .with_span_list(true)
-        .with_env_filter(EnvFilter::from_default_env())
-        .finish();
-    tracing::subscriber::set_global_default(subscriber)?;
-
-    let args: &'static Args = Box::leak(Box::new(Args::parse()));
-    let token = CancellationToken::new();
-    vm_monitor::start(args, token).await
-}
-
-#[cfg(not(target_os = "linux"))]
-fn main() {
-    panic!("the monitor requires cgroups, which are only available on linux")
-}
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -1,693 +0,0 @@
-use std::{
-    fmt::{Debug, Display},
-    fs,
-    pin::pin,
-    sync::atomic::{AtomicU64, Ordering},
-};
-
-use anyhow::{anyhow, bail, Context};
-use cgroups_rs::{
-    freezer::FreezerController,
-    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
-    memory::MemController,
-    MaxValue,
-    Subsystem::{Freezer, Mem},
-};
-use inotify::{EventStream, Inotify, WatchMask};
-use tokio::sync::mpsc::{self, error::TryRecvError};
-use tokio::time::{Duration, Instant};
-use tokio_stream::{Stream, StreamExt};
-use tracing::{info, warn};
-
-use crate::protocol::Resources;
-use crate::MiB;
-
-/// Monotonically increasing counter of the number of memory.high events
-/// the cgroup has experienced.
-///
-/// We use this to determine if a modification to the `memory.events` file actually
-/// changed the `high` field. If not, we don't care about the change. When we
-/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
-/// to see if it changed since last time.
-pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
-
-/// Monotonically increasing counter that gives each cgroup event a unique id.
-///
-/// This allows us to answer questions like "did this upscale arrive before this
-/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
-/// with a sequence number. As such, prefer to used the `Sequenced` type rather
-/// than this static directly.
-static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
-
-/// A memory event type reported in memory.events.
-#[derive(Debug, Eq, PartialEq, Copy, Clone)]
-pub enum MemoryEvent {
-    Low,
-    High,
-    Max,
-    Oom,
-    OomKill,
-    OomGroupKill,
-}
-
-impl MemoryEvent {
-    fn as_str(&self) -> &str {
-        match self {
-            MemoryEvent::Low => "low",
-            MemoryEvent::High => "high",
-            MemoryEvent::Max => "max",
-            MemoryEvent::Oom => "oom",
-            MemoryEvent::OomKill => "oom_kill",
-            MemoryEvent::OomGroupKill => "oom_group_kill",
-        }
-    }
-}
-
-impl Display for MemoryEvent {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(self.as_str())
-    }
-}
-
-/// Configuration for a `CgroupWatcher`
-#[derive(Debug, Clone)]
-pub struct Config {
-    // The target difference between the total memory reserved for the cgroup
-    // and the value of the cgroup's memory.high.
-    //
-    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
-    // use (equal to system memory, minus whatever's taken out for the file cache).
-    oom_buffer_bytes: u64,
-
-    // The amount of memory, in bytes, below a proposed new value for
-    // memory.high that the cgroup's memory usage must be for us to downscale
-    //
-    // In other words, we can downscale only when:
-    //
-    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
-    //
-    // TODO: there's some minor issues with this approach -- in particular, that we might have
-    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
-    pub(crate) memory_high_buffer_bytes: u64,
-
-    // The maximum duration, in milliseconds, that we're allowed to pause
-    // the cgroup for while waiting for the autoscaler-agent to upscale us
-    max_upscale_wait: Duration,
-
-    // The required minimum time, in milliseconds, that we must wait before re-freezing
-    // the cgroup while waiting for the autoscaler-agent to upscale us.
-    do_not_freeze_more_often_than: Duration,
-
-    // The amount of memory, in bytes, that we should periodically increase memory.high
-    // by while waiting for the autoscaler-agent to upscale us.
-    //
-    // This exists to avoid the excessive throttling that happens when a cgroup is above its
-    // memory.high for too long. See more here:
-    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
-    memory_high_increase_by_bytes: u64,
-
-    // The period, in milliseconds, at which we should repeatedly increase the value
-    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
-    // is still being hit.
-    //
-    // Technically speaking, this actually serves as a rate limit to moderate responding to
-    // memory.high events, but these are roughly equivalent if the process is still allocating
-    // memory.
-    memory_high_increase_every: Duration,
-}
-
-impl Config {
-    /// Calculate the new value for the cgroups memory.high based on system memory
-    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
-        total_system_mem.saturating_sub(self.oom_buffer_bytes)
-    }
-}
-
-impl Default for Config {
-    fn default() -> Self {
-        Self {
-            oom_buffer_bytes: 100 * MiB,
-            memory_high_buffer_bytes: 100 * MiB,
-            // while waiting for upscale, don't freeze for more than 20ms every 1s
-            max_upscale_wait: Duration::from_millis(20),
-            do_not_freeze_more_often_than: Duration::from_millis(1000),
-            // while waiting for upscale, increase memory.high by 10MiB every 25ms
-            memory_high_increase_by_bytes: 10 * MiB,
-            memory_high_increase_every: Duration::from_millis(25),
-        }
-    }
-}
-
-/// Used to represent data that is associated with a certain point in time, such
-/// as an upscale request or memory.high event.
-///
-/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
-/// a unique sequence number. Sequence numbers are monotonically increasing,
-/// allowing us to answer questions like "did this upscale happen after this
-/// memory.high event?" by comparing the sequence numbers of the two events.
-#[derive(Debug, Clone)]
-pub struct Sequenced<T> {
-    seqnum: u64,
-    data: T,
-}
-
-impl<T> Sequenced<T> {
-    pub fn new(data: T) -> Self {
-        Self {
-            seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
-            data,
-        }
-    }
-}
-
-/// Responds to `MonitorEvents` to manage the cgroup: preventing it from being
-/// OOM killed or throttling.
-///
-/// The `CgroupWatcher` primarily achieves this by reading from a stream of
-/// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the
-/// cgroup happy.
-#[derive(Debug)]
-pub struct CgroupWatcher {
-    pub config: Config,
-
-    /// The sequence number of the last upscale.
-    ///
-    /// If we receive a memory.high event that has a _lower_ sequence number than
-    /// `last_upscale_seqnum`, then we know it occured before the upscale, and we
-    /// can safely ignore it.
-    ///
-    /// Note: Like the `events` field, this doesn't _need_ interior mutability but we
-    /// use it anyways so that methods take `&self`, not `&mut self`.
-    last_upscale_seqnum: AtomicU64,
-
-    /// A channel on which we send messages to request upscale from the dispatcher.
-    upscale_requester: mpsc::Sender<()>,
-
-    /// The actual cgroup we are watching and managing.
-    cgroup: cgroups_rs::Cgroup,
-}
-
-/// Read memory.events for the desired event type.
-///
-/// `path` specifies the path to the desired `memory.events` file.
-/// For more info, see the `memory.events` section of the [kernel docs]
-/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
-fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
-    let contents = fs::read_to_string(path)
-        .with_context(|| format!("failed to read memory.events from {path}"))?;
-
-    // Then contents of the file look like:
-    // low 42
-    // high 101
-    // ...
-    contents
-        .lines()
-        .filter_map(|s| s.split_once(' '))
-        .find(|(e, _)| *e == event.as_str())
-        .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
-        .and_then(|(_, count)| {
-            count
-                .parse::<u64>()
-                .with_context(|| format!("failed to parse memory.{event} as u64"))
-        })
-}
-
-/// Create an event stream that produces events whenever the file at the provided
-/// path is modified.
-fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
-    info!("creating file watcher for {path}");
-    let inotify = Inotify::init().context("failed to initialize file watcher")?;
-    inotify
-        .watches()
-        .add(path, WatchMask::MODIFY)
-        .with_context(|| format!("failed to start watching {path}"))?;
-    inotify
-        // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
-        // to store one event at a time - if the event gets written over, that's
-        // ok. We still see that there is an event. For more information, see:
-        // https://man7.org/linux/man-pages/man7/inotify.7.html
-        .into_event_stream([0u8; 1024])
-        .context("failed to start inotify event stream")
-}
-
-impl CgroupWatcher {
-    /// Create a new `CgroupWatcher`.
-    #[tracing::instrument(skip_all, fields(%name))]
-    pub fn new(
-        name: String,
-        // A channel on which to send upscale requests
-        upscale_requester: mpsc::Sender<()>,
-    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
-        // TODO: clarify exactly why we need v2
-        // Make sure cgroups v2 (aka unified) are supported
-        if !is_cgroup2_unified_mode() {
-            anyhow::bail!("cgroups v2 not supported");
-        }
-        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
-
-        // Start monitoring the cgroup for memory events. In general, for
-        // cgroups v2 (aka unified), metrics are reported in files like
-        // > `/sys/fs/cgroup/{name}/{metric}`
-        // We are looking for `memory.high` events, which are stored in the
-        // file `memory.events`. For more info, see the `memory.events` section
-        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
-        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
-        let memory_events = create_file_watcher(&path)
-            .with_context(|| format!("failed to create event watcher for {path}"))?
-            // This would be nice with with .inspect_err followed by .ok
-            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
-                Ok(high) => Some(high),
-                Err(error) => {
-                    // TODO: Might want to just panic here
-                    warn!(?error, "failed to read high events count from {}", &path);
-                    None
-                }
-            })
-            // Only report the event if the memory.high count increased
-            .filter_map(|high| {
-                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
-                    Some(high)
-                } else {
-                    None
-                }
-            })
-            .map(Sequenced::new);
-
-        let initial_count = get_event_count(
-            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
-            MemoryEvent::High,
-        )?;
-
-        info!(initial_count, "initial memory.high event count");
-
-        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
-        // running in the cgroup before that caused it to be non-zero.
-        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
-
-        Ok((
-            Self {
-                cgroup,
-                upscale_requester,
-                last_upscale_seqnum: AtomicU64::new(0),
-                config: Default::default(),
-            },
-            memory_events,
-        ))
-    }
-
-    /// The entrypoint for the `CgroupWatcher`.
-    #[tracing::instrument(skip_all)]
-    pub async fn watch<E>(
-        &self,
-        // These are ~dependency injected~ (fancy, I know) because this function
-        // should never return.
-        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
-        // -> therefore: if we want to stick it in an Arc so many threads can access
-        //    it, methods can never take mutable access.
-        //     - note: we use the Arc strategy so that a) we can call this function
-        //             right here and b) the runner can call the set/get_memory methods
-        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
-        //    we just pass them in here instead of holding them in fields, as that
-        //    would require this method to take &mut self.
-        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
-        events: E,
-    ) -> anyhow::Result<()>
-    where
-        E: Stream<Item = Sequenced<u64>>,
-    {
-        // There are several actions might do when receiving a `memory.high`,
-        // such as freezing the cgroup, or increasing its `memory.high`. We don't
-        // want to do these things too often (because postgres needs to run, and
-        // we only have so much memory). These timers serve as rate limits for this.
-        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
-        let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
-        let mut events = pin!(events);
-
-        // Are we waiting to be upscaled? Could be true if we request upscale due
-        // to a memory.high event and it does not arrive in time.
-        let mut waiting_on_upscale = false;
-
-        loop {
-            tokio::select! {
-                upscale = upscales.recv() => {
-                    let Sequenced { seqnum, data } = upscale
-                        .context("failed to listen on upscale notification channel")?;
-                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
-                }
-                event = events.next() => {
-                    let Some(Sequenced { seqnum, .. }) = event else {
-                        bail!("failed to listen for memory.high events")
-                    };
-                    // The memory.high came before our last upscale, so we consider
-                    // it resolved
-                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
-                        info!(
-                            "received memory.high event, but it came before our last upscale -> ignoring it"
-                        );
-                        continue;
-                    }
-
-                    // The memory.high came after our latest upscale. We don't
-                    // want to do anything yet, so peek the next event in hopes
-                    // that it's an upscale.
-                    if let Some(upscale_num) = self
-                        .upscaled(&mut upscales)
-                        .context("failed to check if we were upscaled")?
-                    {
-                        if upscale_num > seqnum {
-                            info!(
-                                "received memory.high event, but it came before our last upscale -> ignoring it"
-                            );
-                            continue;
-                        }
-                    }
-
-                    // If it's been long enough since we last froze, freeze the
-                    // cgroup and request upscale
-                    if wait_to_freeze.is_elapsed() {
-                        info!("received memory.high event -> requesting upscale");
-                        waiting_on_upscale = self
-                            .handle_memory_high_event(&mut upscales)
-                            .await
-                            .context("failed to handle upscale")?;
-                        wait_to_freeze
-                            .as_mut()
-                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
-                        continue;
-                    }
-
-                    // Ok, we can't freeze, just request upscale
-                    if !waiting_on_upscale {
-                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");
-
-                        // Make check to make sure we haven't been upscaled in the
-                        // meantine (can happen if the agent independently decides
-                        // to upscale us again)
-                        if self
-                            .upscaled(&mut upscales)
-                            .context("failed to check if we were upscaled")?
-                            .is_some()
-                        {
-                            info!("no need to request upscaling because we got upscaled");
-                            continue;
-                        }
-                        self.upscale_requester
-                            .send(())
-                            .await
-                            .context("failed to request upscale")?;
-                        continue;
-                    }
-
-                    // Shoot, we can't freeze or and we're still waiting on upscale,
-                    // increase memory.high to reduce throttling
-                    if wait_to_increase_memory_high.is_elapsed() {
-                        info!(
-                            "received memory.high event, \
-                            but too soon to refreeze and already requested upscale \
-                            -> increasing memory.high"
-                        );
-
-                        // Make check to make sure we haven't been upscaled in the
-                        // meantine (can happen if the agent independently decides
-                        // to upscale us again)
-                        if self
-                            .upscaled(&mut upscales)
-                            .context("failed to check if we were upscaled")?
-                            .is_some()
-                        {
-                            info!("no need to increase memory.high because got upscaled");
-                            continue;
-                        }
-
-                        // Request upscale anyways (the agent will handle deduplicating
-                        // requests)
-                        self.upscale_requester
-                            .send(())
-                            .await
-                            .context("failed to request upscale")?;
-
-                        let memory_high =
-                            self.get_high_bytes().context("failed to get memory.high")?;
-                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
-                        info!(
-                            current_high_bytes = memory_high,
-                            new_high_bytes = new_high,
-                            "updating memory.high"
-                        );
-                        self.set_high_bytes(new_high)
-                            .context("failed to set memory.high")?;
-                        wait_to_increase_memory_high
-                            .as_mut()
-                            .reset(Instant::now() + self.config.memory_high_increase_every)
-                    }
-
-                    // we can't do anything
-                }
-            };
-        }
-    }
-
-    /// Handle a `memory.high` event, returning whether we are still waiting on
-    /// upscale by the time the function returns.
-    ///
-    /// The general plan for handling a `memory.high` event is as follows:
-    /// 1. Freeze the cgroup
-    /// 2. Start a timer for `self.config.max_upscale_wait`
-    /// 3. Request upscale
-    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
-    /// 5. Return whether or not we are still waiting for upscale. If we are,
-    ///    we'll increase the cgroups memory.high to avoid getting oom killed
-    ///
-    /// # Errors
-    ///
-    /// Fails if the cgroup cannot be frozen or thawed, if the upscale request
-    /// cannot be sent, or if waiting for the upscale notification fails. Once
-    /// the cgroup has been frozen, every error path attempts to thaw it before
-    /// the error is reported.
-    #[tracing::instrument(skip_all)]
-    async fn handle_memory_high_event(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
-    ) -> anyhow::Result<bool> {
-        // Immediately freeze the cgroup before doing anything else.
-        info!("received memory.high event -> freezing cgroup");
-        self.freeze().context("failed to freeze cgroup")?;
-
-        // We'll use this for logging durations
-        let start_time = Instant::now();
-
-        // Await the upscale until we have to unfreeze
-        let timed =
-            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
-
-        // Request the upscale
-        info!(
-            wait = ?self.config.max_upscale_wait,
-            "sending request for immediate upscaling",
-        );
-        // **important**: the cgroup is frozen at this point, so unfreeze it
-        // before reporting a failed request. Returning early with `?` here
-        // would leave the cgroup frozen.
-        if let Err(e) = self.upscale_requester.send(()).await {
-            info!("error requesting upscale -> thawing cgroup");
-            self.thaw()
-                .context("failed to thaw cgroup after failing to request upscale")?;
-            return Err(e).context("failed to request upscale");
-        }
-
-        let waiting_on_upscale = match timed.await {
-            Ok(Ok(())) => {
-                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
-                false
-            }
-            // **important**: unfreeze the cgroup before ?-reporting the error
-            Ok(Err(e)) => {
-                info!("error waiting for upscale -> thawing cgroup");
-                self.thaw()
-                    .context("failed to thaw cgroup after errored waiting for upscale")?;
-                Err(e.context("failed to await upscale"))?
-            }
-            // The timeout fired before any upscale arrived.
-            Err(_) => {
-                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
-                true
-            }
-        };
-
-        info!("thawing cgroup");
-        self.thaw().context("failed to thaw cgroup")?;
-
-        Ok(waiting_on_upscale)
-    }
-
-    /// Polls the upscale channel once, without blocking.
-    ///
-    /// Returns `Ok(Some(seqnum))` when an upscale notification was waiting in
-    /// the channel (after recording its sequence number in
-    /// `self.last_upscale_seqnum`), `Ok(None)` when the channel is currently
-    /// empty, and an error when the channel has been disconnected.
-    #[tracing::instrument(skip_all)]
-    fn upscaled(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
-    ) -> anyhow::Result<Option<u64>> {
-        match upscales.try_recv() {
-            // Nothing pending: we were not upscaled since the last check.
-            Err(TryRecvError::Empty) => Ok(None),
-            Err(TryRecvError::Disconnected) => {
-                bail!("upscale notification channel was disconnected")
-            }
-            Ok(Sequenced { seqnum, data }) => {
-                // Remember the sequence number of the latest upscale we saw
-                self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-                info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
-                Ok(Some(seqnum))
-            }
-        }
-    }
-
-    /// Wait until the next upscale notification arrives on `upscales` and
-    /// record its sequence number in `self.last_upscale_seqnum`.
-    ///
-    /// This is used in `handle_memory_high_event`, where we need to listen
-    /// for upscales in particular so we know if we can thaw the cgroup early.
-    ///
-    /// Returns an error if the channel yields no value.
-    #[tracing::instrument(skip_all)]
-    async fn await_upscale(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
-    ) -> anyhow::Result<()> {
-        let received = upscales.recv().await;
-        let upscale = received.context("error listening for upscales")?;
-
-        self.last_upscale_seqnum
-            .store(upscale.seqnum, Ordering::Release);
-        Ok(())
-    }
-
-    /// Returns the path of the underlying cgroup, as reported by `self.cgroup`.
-    pub fn path(&self) -> &str {
-        let cgroup = &self.cgroup;
-        cgroup.path()
-    }
-}
-
-/// The pair of memory limits (`high` and `max`) that we write to a cgroup to
-/// control its memory usage.
-///
-/// Setting these values also affects the thresholds for receiving usage alerts.
-#[derive(Debug)]
-pub struct MemoryLimits {
-    // Value written to the cgroup's memory.high
-    high: u64,
-    // Value written to the cgroup's memory.max
-    max: u64,
-}
-
-impl MemoryLimits {
-    /// Bundle a `high` and a `max` value into a `MemoryLimits`.
-    pub fn new(high: u64, max: u64) -> Self {
-        Self { high, max }
-    }
-}
-
-// Methods for manipulating the actual cgroup
-impl CgroupWatcher {
-    /// Look up the freezer controller among the cgroup's subsystems.
-    ///
-    /// Fails if the cgroup has no freezer subsystem.
-    fn freezer(&self) -> anyhow::Result<&FreezerController> {
-        for subsystem in self.cgroup.subsystems().iter() {
-            if let Freezer(freezer) = subsystem {
-                return Ok(freezer);
-            }
-        }
-        anyhow::bail!("could not find freezer subsystem")
-    }
-
-    /// Try to freeze the cgroup via its freezer subsystem.
-    pub fn freeze(&self) -> anyhow::Result<()> {
-        let freezer = self
-            .freezer()
-            .context("failed to get freezer subsystem")?;
-        freezer.freeze().context("failed to freeze")
-    }
-
-    /// Try to thaw the cgroup via its freezer subsystem.
-    pub fn thaw(&self) -> anyhow::Result<()> {
-        let freezer = self
-            .freezer()
-            .context("failed to get freezer subsystem")?;
-        freezer.thaw().context("failed to thaw")
-    }
-
-    /// Look up the memory controller among the cgroup's subsystems.
-    ///
-    /// Note: this method does not require `self.memory_update_lock` because
-    /// getting a handle to the subsystem does not access any of the files we
-    /// care about, such as memory.high and memory.events
-    ///
-    /// Fails if the cgroup has no memory subsystem.
-    fn memory(&self) -> anyhow::Result<&MemController> {
-        for subsystem in self.cgroup.subsystems().iter() {
-            if let Mem(memory) = subsystem {
-                return Ok(memory);
-            }
-        }
-        anyhow::bail!("could not find memory subsystem")
-    }
-
-    /// Read the cgroup's current memory usage (`usage_in_bytes`).
-    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
-        let memory = self.memory().context("failed to get memory subsystem")?;
-        Ok(memory.memory_stat().usage_in_bytes)
-    }
-
-    /// Write a new memory.high threshold, leaving the other limits untouched.
-    ///
-    /// `bytes` is clamped to `i64::MAX` because the limit is passed on as an
-    /// `i64`.
-    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
-        let memory = self.memory().context("failed to get memory subsystem")?;
-        let high = MaxValue::Value(bytes.min(i64::MAX as u64) as i64);
-        let request = cgroups_rs::memory::SetMemory {
-            low: None,
-            high: Some(high),
-            min: None,
-            max: None,
-        };
-        memory.set_mem(request).context("failed to set memory.high")
-    }
-
-    /// Write both memory.high and memory.max from `limits`.
-    ///
-    /// Each value is clamped to `i64::MAX` because the limits are passed on as
-    /// `i64`s.
-    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
-        info!(
-            limits.high,
-            limits.max,
-            path = self.path(),
-            "writing new memory limits",
-        );
-        let memory = self
-            .memory()
-            .context("failed to get memory subsystem while setting memory limits")?;
-        let high = MaxValue::Value(limits.high.min(i64::MAX as u64) as i64);
-        let max = MaxValue::Value(limits.max.min(i64::MAX as u64) as i64);
-        let request = cgroups_rs::memory::SetMemory {
-            min: None,
-            low: None,
-            high: Some(high),
-            max: Some(max),
-        };
-        memory
-            .set_mem(request)
-            .context("failed to set memory limits")
-    }
-
-    /// Derive the cgroup memory limits from `available_memory` and apply them.
-    ///
-    /// memory.max becomes `available_memory`; memory.high is computed from it
-    /// by `self.config.calculate_memory_high_value`.
-    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
-        let limits = MemoryLimits::new(
-            self.config.calculate_memory_high_value(available_memory),
-            available_memory,
-        );
-        info!(
-            path = self.path(),
-            memory = ?limits,
-            "setting cgroup memory",
-        );
-        self.set_limits(&limits)
-            .context("failed to set cgroup memory limits")
-    }
-
-    /// Read the current memory.high threshold.
-    ///
-    /// A threshold of "max" is reported as `i64::MAX`. Fails if the memory
-    /// subsystem does not report a value for memory.high.
-    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
-        let memory = self
-            .memory()
-            .context("failed to get memory subsystem while getting memory statistics")?;
-        let stats = memory
-            .get_mem()
-            .context("failed to get memory statistics from subsystem")?;
-        match stats.high {
-            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
-            Some(MaxValue::Max) => Ok(i64::MAX as u64),
-            Some(MaxValue::Value(high)) => Ok(high as u64),
-        }
-    }
-}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -1,153 +0,0 @@
-//! Managing the websocket connection and other signals in the monitor.
-//!
-//! Contains types that manage the interaction (not data interchange, see `protocol`)
-//! between agent and monitor, allowing us to process and send messages in a
-//! straightforward way. The dispatcher also manages the signals that come from
-//! the cgroup (requesting upscale), and the signals that go to the cgroup
-//! (notifying it of upscale).
-
-use anyhow::{bail, Context};
-use axum::extract::ws::{Message, WebSocket};
-use futures::{
-    stream::{SplitSink, SplitStream},
-    SinkExt, StreamExt,
-};
-use tokio::sync::mpsc;
-use tracing::info;
-
-use crate::cgroup::Sequenced;
-use crate::protocol::{
-    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
-    PROTOCOL_MIN_VERSION,
-};
-
-/// The central handler for all communications in the monitor.
-///
-/// The dispatcher has two purposes:
-/// 1. Manage the connection to the agent, sending and receiving messages.
-/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
-///    and sending a message to the agent when the cgroup manager requests
-///    upscale.
-#[derive(Debug)]
-pub struct Dispatcher {
-    /// We read agent messages off of `source`
-    pub(crate) source: SplitStream<WebSocket>,
-
-    /// We send messages to the agent through `sink`
-    sink: SplitSink<WebSocket, Message>,
-
-    /// Used to notify the cgroup when we are upscaled.
-    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
-
-    /// When the cgroup requests upscale it will send on this channel. In response
-    /// we send an upscale request to the agent.
-    pub(crate) request_upscale_events: mpsc::Receiver<()>,
-
-    /// The protocol version we have agreed to use with the agent. This is negotiated
-    /// during the creation of the dispatcher, and should be the highest shared protocol
-    /// version.
-    ///
-    // NOTE: currently unused, but will almost certainly be used in the future
-    // as the protocol changes
-    #[allow(unused)]
-    pub(crate) proto_version: ProtocolVersion,
-}
-
-impl Dispatcher {
-    /// Creates a new dispatcher using the passed-in connection.
-    ///
-    /// Performs a negotiation with the agent to determine the highest protocol
-    /// version that both support. This consists of two steps:
-    /// 1. Wait for the agent to send the range of protocols it supports.
-    /// 2. Send a protocol version that works for us as well, or an error if there
-    ///    is no compatible version.
-    ///
-    /// # Errors
-    ///
-    /// Fails if the connection closes or errors during the handshake, if the
-    /// first message is not a text message containing a valid protocol range,
-    /// if our response cannot be serialized or sent, or if the two version
-    /// ranges do not overlap (in which case the agent is told so first).
-    pub async fn new(
-        stream: WebSocket,
-        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
-        request_upscale_events: mpsc::Receiver<()>,
-    ) -> anyhow::Result<Self> {
-        let (mut sink, mut source) = stream.split();
-
-        // Figure out the highest protocol version we both support
-        info!("waiting for agent to send protocol version range");
-        let Some(message) = source.next().await else {
-            bail!("websocket connection closed while performing protocol handshake")
-        };
-
-        let message = message.context("failed to read protocol version range off connection")?;
-
-        let Message::Text(message_text) = message else {
-            // All messages should be in text form, since we don't do any
-            // pinging/ponging. See nhooyr/websocket's implementation and the
-            // agent for more info
-            bail!("received non-text message during protocol handshake: {message:?}")
-        };
-
-        let monitor_range = ProtocolRange {
-            min: PROTOCOL_MIN_VERSION,
-            max: PROTOCOL_MAX_VERSION,
-        };
-
-        let agent_range: ProtocolRange = serde_json::from_str(&message_text)
-            .context("failed to deserialize protocol version range")?;
-
-        info!(range = ?agent_range, "received protocol version range");
-
-        let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
-            Ok(version) => {
-                // Report a serialization failure as an error instead of panicking
-                let response = serde_json::to_string(&ProtocolResponse::Version(version))
-                    .context("failed to serialize negotiated protocol version")?;
-                sink.send(Message::Text(response))
-                    .await
-                    .context("failed to notify agent of negotiated protocol version")?;
-                version
-            }
-            Err(e) => {
-                // Tell the agent why the handshake failed before bailing out
-                let response = serde_json::to_string(&ProtocolResponse::Error(format!(
-                    "Received protocol version range {} which does not overlap with {}",
-                    agent_range, monitor_range
-                )))
-                .context("failed to serialize protocol version range error")?;
-                sink.send(Message::Text(response))
-                    .await
-                    .context("failed to notify agent of no overlap between protocol version ranges")?;
-                Err(e).context("error determining suitable protocol version range")?
-            }
-        };
-
-        Ok(Self {
-            sink,
-            source,
-            notify_upscale_events,
-            request_upscale_events,
-            proto_version: highest_shared_version,
-        })
-    }
-
-    /// Notify the cgroup manager that we have received upscale.
-    ///
-    /// This sends `resources` on `notify_upscale_events` and returns once the
-    /// value has been accepted by the channel. Fails if the send fails.
-    #[tracing::instrument(skip_all, fields(?resources))]
-    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
-        self.notify_upscale_events
-            .send(resources)
-            .await
-            .context("failed to send upscale notification across channel")
-    }
-
-    /// Send a message to the agent.
-    ///
-    /// Although this function is small, it has one major benefit: it is the only
-    /// way to send data across the connection, and you can only pass in a proper
-    /// `OutboundMsg`. Without safeguards like this, it's easy to accidentally
-    /// serialize the wrong thing and send it, since `self.sink.send` will take
-    /// any string.
-    ///
-    /// Fails if the message cannot be serialized to JSON or if sending it on
-    /// the sink fails.
-    pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
-        info!(?message, "sending message");
-        let json = serde_json::to_string(&message).context("failed to serialize message")?;
-        self.sink
-            .send(Message::Text(json))
-            .await
-            .context("stream error sending message")
-    }
-}
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -1,306 +0,0 @@
-//! Logic for configuring and scaling the Postgres file cache.
-
-use std::num::NonZeroU64;
-
-use crate::MiB;
-use anyhow::{anyhow, Context};
-use tokio_postgres::{types::ToSql, Client, NoTls, Row};
-use tokio_util::sync::CancellationToken;
-use tracing::{error, info};
-
-/// Manages Postgres' file cache by keeping a connection open.
-#[derive(Debug)]
-pub struct FileCacheState {
-    /// The Postgres client that queries are issued on. It is replaced with a
-    /// fresh connection when a query has to be retried.
-    client: Client,
-    /// The connection string, kept so that we can reconnect.
-    conn_str: String,
-    /// Sizing parameters for the file cache.
-    pub(crate) config: FileCacheConfig,
-
-    /// A token for cancelling spawned threads during shutdown.
-    token: CancellationToken,
-}
-
-/// Parameters that determine how large the file cache should be for a given
-/// amount of total resources. See `validate` for the constraints between fields.
-#[derive(Debug)]
-pub struct FileCacheConfig {
-    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
-    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
-    /// memory available for the cgroup.
-    pub(crate) in_memory: bool,
-
-    /// The size of the file cache, in terms of the size of the resource it consumes
-    /// (currently: only memory)
-    ///
-    /// For example, setting `resource_multiplier = 0.75` gives the cache a target size of 75% of total
-    /// resources.
-    ///
-    /// This value must be strictly between 0 and 1.
-    resource_multiplier: f64,
-
-    /// The required minimum amount of memory, in bytes, that must remain available
-    /// after subtracting the file cache.
-    ///
-    /// This value must be non-zero.
-    min_remaining_after_cache: NonZeroU64,
-
-    /// Controls the rate of increase in the file cache's size as it grows from zero
-    /// (when total resources equals min_remaining_after_cache) to the desired size based on
-    /// `resource_multiplier`.
-    ///
-    /// A `spread_factor` of zero means that all additional resources will go to the cache until it
-    /// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
-    /// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
-    /// its desired size".
-    ///
-    /// This value must be >= 0, and must retain an increase that is more than what would be given by
-    /// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
-    /// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
-    /// as desired by `resource_multiplier`.
-    ///
-    /// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
-    spread_factor: f64,
-}
-
-impl Default for FileCacheConfig {
-    /// The default file cache configuration: an in-memory cache targeting 75%
-    /// of total resources, leaving at least 640 MiB outside the cache.
-    fn default() -> Self {
-        // 640 MiB; (512 + 128)
-        let min_remaining_after_cache = NonZeroU64::new(640 * MiB).unwrap();
-
-        Self {
-            in_memory: true,
-            // 75 %
-            resource_multiplier: 0.75,
-            min_remaining_after_cache,
-            // ensure any increase in file cache size is split 90-10 with 10% to other memory
-            spread_factor: 0.1,
-        }
-    }
-}
-
-impl FileCacheConfig {
-    /// Make sure fields of the config are consistent.
-    ///
-    /// Checks that `resource_multiplier` lies strictly between 0 and 1, that
-    /// `spread_factor` is non-negative, and that
-    /// `resource_multiplier * (spread_factor + 1)` is less than 1. Returns an
-    /// error describing the first violated constraint.
-    pub fn validate(&self) -> anyhow::Result<()> {
-        // Single field validity
-        anyhow::ensure!(
-            0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
-            "resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
-            self.resource_multiplier
-        );
-        anyhow::ensure!(
-            self.spread_factor >= 0.0,
-            "spread_factor must be >= 0, got {}",
-            self.spread_factor
-        );
-
-        // Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
-        //
-        // As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
-        // `spread_factor`, respectively. They are:
-        //
-        //                 `total`           `min_remaining_after_cache`
-        //   size = ————————————————————— - —————————————————————————————
-        //           `spread_factor` + 1         `spread_factor` + 1
-        //
-        // and
-        //
-        //   size = `resource_multiplier` × total
-        //
-        // .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
-        // form, with y = "size" and x = "total".
-        //
-        // These lines intersect at:
-        //
-        //               `min_remaining_after_cache`
-        //   ———————————————————————————————————————————————————
-        //    1 - `resource_multiplier` × (`spread_factor` + 1)
-        //
-        // We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
-        // guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
-        // (We also need it to be >= 0, but that's already guaranteed.)
-
-        let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
-        // Like the checks above, report the offending value.
-        anyhow::ensure!(
-            intersect_factor < 1.0,
-            "incompatible resource_multiplier and spread_factor: \
-             resource_multiplier * (spread_factor + 1) must be < 1, got {}",
-            intersect_factor
-        );
-        Ok(())
-    }
-
-    /// Calculate the desired size of the cache, given the total memory
-    ///
-    /// `total` is in bytes. The result is also in bytes, rounded down to a
-    /// whole number of mebibytes, and is 0 when `total` does not exceed
-    /// `min_remaining_after_cache`.
-    pub fn calculate_cache_size(&self, total: u64) -> u64 {
-        // *Note*: all units are in bytes.
-        let available = total.saturating_sub(self.min_remaining_after_cache.get());
-        if available == 0 {
-            return 0;
-        }
-
-        // Conversions to ensure we don't overflow from floating-point ops
-        let size_from_spread =
-            i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
-
-        let size_from_normal = (total as f64 * self.resource_multiplier) as u64;
-
-        // The cache gets whichever of the two constraints is smaller
-        let byte_size = u64::min(size_from_spread, size_from_normal);
-
-        // The file cache operates in units of mebibytes, so the sizes we produce should
-        // be rounded to a mebibyte. We round down to be conservative.
-        byte_size / MiB * MiB
-    }
-}
-
-impl FileCacheState {
-    /// Connect to the file cache.
-    ///
-    /// Validates `config` first, then opens a connection to Postgres using
-    /// `conn_str`. Fails if the config is invalid or the connection cannot be
-    /// established.
-    #[tracing::instrument(skip_all, fields(%conn_str, ?config))]
-    pub async fn new(
-        conn_str: &str,
-        config: FileCacheConfig,
-        token: CancellationToken,
-    ) -> anyhow::Result<Self> {
-        config.validate().context("file cache config is invalid")?;
-
-        info!(conn_str, "connecting to Postgres file cache");
-        let client = FileCacheState::connect(conn_str, token.clone())
-            .await
-            .context("failed to connect to postgres file cache")?;
-
-        let conn_str = conn_str.to_string();
-        Ok(Self {
-            client,
-            config,
-            conn_str,
-            token,
-        })
-    }
-
-    /// Connect to Postgres.
-    ///
-    /// Aborts the spawned thread if the kill signal is received. This is not
-    /// a method as it is called in [`FileCacheState::new`].
-    #[tracing::instrument(skip_all, fields(%conn_str))]
-    async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
-        let (client, conn) = tokio_postgres::connect(conn_str, NoTls)
-            .await
-            .context("failed to connect to pg client")?;
-
-        // The connection object performs the actual communication with the database,
-        // so spawn it off to run on its own. See tokio-postgres docs.
-        crate::spawn_with_cancel(
-            token,
-            |res| {
-                if let Err(error) = res {
-                    error!(%error, "postgres error")
-                }
-            },
-            conn,
-        );
-
-        Ok(client)
-    }
-
-    /// Execute a query with a retry if necessary.
-    ///
-    /// If the initial query fails, we restart the database connection and attempt
-    /// it again. The new connection replaces `self.client` even if the second
-    /// attempt fails.
-    #[tracing::instrument(skip_all, fields(%statement))]
-    pub async fn query_with_retry(
-        &mut self,
-        statement: &str,
-        params: &[&(dyn ToSql + Sync)],
-    ) -> anyhow::Result<Vec<Row>> {
-        match self
-            .client
-            .query(statement, params)
-            .await
-            .context("failed to execute query")
-        {
-            Ok(rows) => Ok(rows),
-            Err(e) => {
-                error!(error = ?e, "postgres error: {e} -> retrying");
-
-                let client = FileCacheState::connect(&self.conn_str, self.token.clone())
-                    .await
-                    .context("failed to connect to postgres file cache")?;
-                info!("successfully reconnected to postgres client");
-
-                // Replace the old client and attempt the query with the new one
-                self.client = client;
-                self.client
-                    .query(statement, params)
-                    .await
-                    .context("failed to execute query a second time")
-            }
-        }
-    }
-
-    /// Get the current size of the file cache.
-    ///
-    /// Reads the `neon.file_cache_size_limit` setting and returns it in bytes.
-    #[tracing::instrument(skip_all)]
-    pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
-        self.query_with_retry(
-            // The file cache GUC variable is in MiB, but the conversion with
-            // pg_size_bytes means that the end result we get is in bytes.
-            "SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));",
-            &[],
-        )
-        .await
-        .context("failed to query pg for file cache size")?
-        .first()
-        .ok_or_else(|| anyhow!("file cache size query returned no rows"))?
-        // pg_size_bytes returns a bigint which is the same as an i64.
-        .try_get::<_, i64>(0)
-        // The size limit is not expected to be negative, so the cast is sound.
-        .map(|bytes| bytes as u64)
-        .context("failed to extract file cache size from query result")
-    }
-
-    /// Attempt to set the file cache size, returning the size it was actually
-    /// set to.
-    ///
-    /// `num_bytes` is capped at `neon.max_file_cache_size` and rounded down to
-    /// a whole number of MiB. The returned size is in bytes.
-    #[tracing::instrument(skip_all, fields(%num_bytes))]
-    pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
-        let max_bytes = self
-            // The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
-            // means that the end result we get is in bytes.
-            .query_with_retry(
-                "SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
-                &[],
-            )
-            .await
-            .context("failed to query pg for max file cache size")?
-            .first()
-            .ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
-            .try_get::<_, i64>(0)
-            .map(|bytes| bytes as u64)
-            .context("failed to extract max file cache size from query result")?;
-
-        let max_mb = max_bytes / MiB;
-        let num_mb = u64::min(num_bytes, max_bytes) / MiB;
-
-        let capped = if num_bytes > max_bytes {
-            " (capped by maximum size)"
-        } else {
-            ""
-        };
-
-        info!(
-            size = num_mb,
-            max = max_mb,
-            "updating file cache size {capped}",
-        );
-
-        // note: even though the normal ways to get the cache size produce values with trailing "MB"
-        // (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
-        // it expects to set the value is "integer number of MB" without trailing units.
-        // For some reason, this *really* wasn't working with normal arguments, so that's
-        // why we're constructing the query here.
-        //
-        // Go through `query_with_retry`, like every other query in this impl, so that a
-        // dropped connection is re-established instead of failing the resize outright.
-        self.query_with_retry(
-            &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
-            &[],
-        )
-        .await
-        .context("failed to change file cache size limit")?;
-
-        // must use pg_reload_conf to have the settings change take effect
-        self.query_with_retry("SELECT pg_reload_conf();", &[])
-            .await
-            .context("failed to reload config")?;
-
-        Ok(num_mb * MiB)
-    }
-}
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,205 +0,0 @@
-#![cfg(target_os = "linux")]
-
-use anyhow::Context;
-use axum::{
-    extract::{ws::WebSocket, State, WebSocketUpgrade},
-    response::Response,
-};
-use axum::{routing::get, Router, Server};
-use clap::Parser;
-use futures::Future;
-use std::{fmt::Debug, time::Duration};
-use sysinfo::{RefreshKind, System, SystemExt};
-use tokio::{sync::broadcast, task::JoinHandle};
-use tokio_util::sync::CancellationToken;
-use tracing::{error, info};
-
-use runner::Runner;
-
-// Code that interfaces with agent
-pub mod dispatcher;
-pub mod protocol;
-
-pub mod cgroup;
-pub mod filecache;
-pub mod runner;
-
-// NOTE: with clap's derive API the `///` comments on this struct and its
-// fields double as the generated `--help` text, so rewording them changes
-// user-visible output.
-/// The vm-monitor is an autoscaling component started by compute_ctl.
-///
-/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
-/// memory pressure by making requests to the autoscaler-agent.
-#[derive(Debug, Parser)]
-pub struct Args {
-    // Optional: when absent, `Runner::new` does not create a cgroup watcher.
-    /// The name of the cgroup we should monitor for memory.high events. This
-    /// is the cgroup that postgres should be running in.
-    #[arg(short, long)]
-    pub cgroup: Option<String>,
-
-    // Optional: when absent, `Runner::new` skips file cache initialization.
-    /// The connection string for the Postgres file cache we should manage.
-    #[arg(short, long)]
-    pub pgconnstr: Option<String>,
-
-    // Required. `start` parses this string into the address it binds to.
-    /// The address we should listen on for connection requests. For the
-    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
-    #[arg(short, long)]
-    pub addr: String,
-}
-
-impl Args {
-    /// Borrow the listen address exactly as it was given on the command line.
-    pub fn addr(&self) -> &str {
-        self.addr.as_str()
-    }
-}
-
-/// The number of bytes in one mebibyte.
-// The name keeps the unit's usual mixed-case spelling, hence the lint override.
-#[allow(non_upper_case_globals)]
-const MiB: u64 = 1 << 20;
-
-/// Express a byte count as a (fractional) number of mebibytes.
-///
-/// Intended for log/display output; most calculations in this crate stay in
-/// bytes. The result is an `f32`, so very large inputs lose precision.
-pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
-    let bytes_per_mebibyte = MiB as f32;
-    bytes as f32 / bytes_per_mebibyte
-}
-
-/// Ask `sysinfo` for the machine's total memory.
-///
-/// Only the memory statistics are refreshed. Callers in this crate treat the
-/// returned value as a number of bytes.
-pub fn get_total_system_memory() -> u64 {
-    let memory_only = RefreshKind::new().with_memory();
-    let system = System::new_with_specifics(memory_only);
-    system.total_memory()
-}
-
-/// Global app state for the Axum server
-///
-/// A clone of this state is handed to `ws_handler` for every incoming request.
-#[derive(Debug, Clone)]
-pub struct ServerState {
-    /// Used to close old connections.
-    ///
-    /// When a new connection is made, we send a message signalling to the old
-    /// connection to close.
-    pub sender: broadcast::Sender<()>,
-
-    /// Used to cancel all tasks spawned by the monitor (see
-    /// [`spawn_with_cancel`]).
-    pub token: CancellationToken,
-
-    /// The CLI args
-    pub args: &'static Args,
-}
-
-/// Spawn a tokio task that races `future` against cancellation of `token`.
-///
-/// This is mainly meant for futures that will be pending for a very long time,
-/// or that are not meant to return at all. If `future` completes first, `f` is
-/// called with a reference to its output (useful for logging when a future such
-/// as [`cgroup::CgroupWatcher::watch`] was never supposed to resolve) and the
-/// task yields `Some(output)`. If the token is cancelled first, the task logs
-/// that fact and yields `None`.
-pub fn spawn_with_cancel<T, F>(
-    token: CancellationToken,
-    f: F,
-    future: T,
-) -> JoinHandle<Option<T::Output>>
-where
-    T: Future + Send + 'static,
-    T::Output: Send + 'static,
-    F: FnOnce(&T::Output) + Send + 'static,
-{
-    let cancellable = async move {
-        tokio::select! {
-            _ = token.cancelled() => {
-                info!("received global kill signal");
-                None
-            }
-            output = future => {
-                f(&output);
-                Some(output)
-            }
-        }
-    };
-
-    tokio::spawn(cancellable)
-}
-
-/// The entrypoint to the monitor: start the http server and serve it until it
-/// exits.
-///
-/// Registers the `/monitor` websocket route, binds to the address in `args`,
-/// and serves requests. `token` is stored in the server state and passed on to
-/// every monitor started by `ws_handler`.
-///
-/// Returns an error if the listen address cannot be parsed, if binding to it
-/// fails, or if the server stops with an error.
-pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
-    // This channel is used to close old connections. When a new connection is
-    // made, we send a message signalling to the old connection to close.
-    let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
-
-    let app = Router::new()
-        // This route gets upgraded to a websocket connection. We only support
-        // one connection at a time, which we enforce by killing old connections
-        // when we receive a new one.
-        .route("/monitor", get(ws_handler))
-        .with_state(ServerState {
-            sender,
-            token,
-            args,
-        });
-
-    let addr = args.addr();
-    // The address comes straight from the command line, so a malformed value
-    // is an ordinary user error: report it through the `Result` rather than
-    // panicking.
-    let socket_addr = addr
-        .parse::<std::net::SocketAddr>()
-        .with_context(|| format!("failed to parse listen address {addr}"))?;
-    let bound = Server::try_bind(&socket_addr)
-        .with_context(|| format!("failed to bind to {addr}"))?;
-
-    info!(addr, "server bound");
-
-    bound
-        .serve(app.into_make_service())
-        .await
-        .context("server exited")?;
-
-    Ok(())
-}
-
-/// Handles incoming websocket connections.
-///
-/// If we are already connected to an agent, we kill that old connection
-/// and accept the new one.
-#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
-pub async fn ws_handler(
-    ws: WebSocketUpgrade,
-    State(ServerState {
-        sender,
-        token,
-        args,
-    }): State<ServerState>,
-) -> Response {
-    // Kill the old monitor. The send result is deliberately ignored: a
-    // broadcast send fails when nobody is subscribed, which just means there
-    // was no old connection to close.
-    info!("closing old connection if there is one");
-    let _ = sender.send(());
-
-    // Start the new one. Wow, the cycle of death and rebirth
-    // Subscribing only *after* the send above matters: a broadcast receiver
-    // only sees values sent after it subscribed, so the new monitor does not
-    // receive the kill message meant for its predecessor.
-    let closer = sender.subscribe();
-    ws.on_upgrade(|ws| start_monitor(ws, args, closer, token))
-}
-
-/// Creates a `Runner` for a freshly upgraded websocket and drives it until it
-/// stops.
-///
-/// Creation is given four seconds; if it fails or times out, the problem is
-/// logged and the function returns. Otherwise the runner is run to completion
-/// and the way it ended (killed by a newer connection, or an error) is logged.
-#[tracing::instrument(skip_all, fields(?args))]
-async fn start_monitor(
-    ws: WebSocket,
-    args: &Args,
-    kill: broadcast::Receiver<()>,
-    token: CancellationToken,
-) {
-    info!("accepted new websocket connection -> starting monitor");
-
-    let timeout = Duration::from_secs(4);
-    let creation = Runner::new(Default::default(), args, ws, kill, token);
-    let mut runner = match tokio::time::timeout(timeout, creation).await {
-        Err(_) => {
-            error!(
-                ?timeout,
-                "creating monitor timed out (probably waiting to receive protocol range)"
-            );
-            return;
-        }
-        Ok(Err(error)) => {
-            error!(?error, "failed to create monitor");
-            return;
-        }
-        Ok(Ok(runner)) => runner,
-    };
-    info!("connected to agent");
-
-    if let Err(e) = runner.run().await {
-        error!(error = ?e, "monitor terminated unexpectedly");
-    } else {
-        info!("monitor was killed due to new connection");
-    }
-}
--- a/libs/vm_monitor/src/protocol.rs
+++ b/libs/vm_monitor/src/protocol.rs
@@ -1,241 +0,0 @@
-//! Types representing protocols and actual agent-monitor messages.
-//!
-//! The pervasive use of serde modifiers throughout this module is to ease
-//! serialization on the Go side. Because Go does not have enums (which model
-//! messages well), it is harder to model messages, and we accommodate that with
-//! serde.
-//!
-//! *Note*: the agent sends and receives messages in different ways.
-//!
-//! The agent serializes messages in the form shown below and then sends them.
-//! The use of `#[serde(tag = "type", content = "content")]` allows us to use
-//! `Type` to determine how to deserialize `Content`.
-//! ```ignore
-//! struct {
-//!     Content any
-//!     Type    string
-//!     Id      uint64
-//! }
-//! ```
-//! and receives messages in the form:
-//! ```ignore
-//! struct {
-//!     {fields embedded}
-//!     Type string
-//!     Id   uint64
-//! }
-//! ```
-//! After reading the type field, the agent will decode the entire message
-//! again, this time into the correct type using the embedded fields.
-//! Because the agent cannot just extract the json contained in a certain field
-//! (it initially deserializes to `map[string]interface{}`), we keep the fields
-//! at the top level, so the entire piece of json can be deserialized into a struct,
-//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
-
-use core::fmt;
-use std::cmp;
-
-use serde::{de::Error, Deserialize, Serialize};
-
-/// A message we send to the agent.
-#[derive(Serialize, Deserialize, Debug, Clone)]
-pub struct OutboundMsg {
-    // Flattened so that the fields of the message kind sit at the top level of
-    // the JSON object, next to `id`, rather than nested under an `inner` key.
-    #[serde(flatten)]
-    pub(crate) inner: OutboundMsgKind,
-    /// Identifier sent along with the message.
-    pub(crate) id: usize,
-}
-
-impl OutboundMsg {
-    /// Pair a message payload with the id it should be sent under.
-    pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
-        OutboundMsg { inner, id }
-    }
-}
-
-/// The different underlying message types we can send to the agent.
-#[derive(Serialize, Deserialize, Debug, Clone)]
-#[serde(tag = "type")]
-pub enum OutboundMsgKind {
-    /// Indicates that the agent sent an invalid message, i.e., we couldn't
-    /// properly deserialize it.
-    InvalidMessage { error: String },
-    /// Indicates that we experienced an internal error while processing a message.
-    /// For example, if a cgroup operation fails while trying to handle an upscale,
-    /// we return `InternalError`.
-    InternalError { error: String },
-    /// Returned to the agent once we have finished handling an upscale. If the
-    /// handling was unsuccessful, an `InternalError` will get returned instead.
-    /// *Note*: this is a struct variant because of the way go serializes struct{}
-    UpscaleConfirmation {},
-    /// Indicates to the agent that we are urgently requesting resources.
-    /// *Note*: this is a struct variant because of the way go serializes struct{}
-    UpscaleRequest {},
-    /// Returned to the agent once we have finished attempting to downscale. If
-    /// an error occurred trying to do so, an `InternalError` will get returned instead.
-    /// However, if we are simply unsuccessful (for example, due to needing the resources),
-    /// that gets included in the `DownscaleResult`.
-    DownscaleResult {
-        // FIXME for the future (once the informant is deprecated)
-        // As of the time of writing, the agent/informant version of this struct is
-        // called api.DownscaleResult. This struct has uppercase fields which are
-        // serialized as such. Thus, we serialize using uppercase names so we don't
-        // have to make a breaking change to the agent<->informant protocol. Once
-        // the informant has been superseded by the monitor, we can add the correct
-        // struct tags to api.DownscaleResult without causing a breaking change,
-        // since we don't need to support the agent<->informant protocol anymore.
-        #[serde(rename = "Ok")]
-        ok: bool,
-        #[serde(rename = "Status")]
-        status: String,
-    },
-    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
-    /// agent.
-    /// *Note*: this is a struct variant because of the way go serializes struct{}
-    HealthCheck {},
-}
-
-/// A message received from the agent.
-#[derive(Serialize, Deserialize, Debug, Clone)]
-pub struct InboundMsg {
-    // Flattened so that the `type`/`content` keys of the message kind are read
-    // from the top level of the JSON object, next to `id`.
-    #[serde(flatten)]
-    pub(crate) inner: InboundMsgKind,
-    /// Identifier of the message; replies built in `Runner::process_message`
-    /// reuse it.
-    pub(crate) id: usize,
-}
-
-/// The different underlying message types we can receive from the agent.
-#[derive(Serialize, Deserialize, Debug, Clone)]
-#[serde(tag = "type", content = "content")]
-pub enum InboundMsgKind {
-    /// Indicates that we sent an invalid message, i.e., the agent couldn't
-    /// properly deserialize it.
-    InvalidMessage { error: String },
-    /// Indicates that the agent experienced an internal error while processing
-    /// a message.
-    // NOTE(review): the original wording here spoke of the informant failing to
-    // request upscale from the agent, which looks copied from the other side of
-    // the protocol - confirm the intended example.
-    InternalError { error: String },
-    /// Indicates to us that we have been granted more resources. We should respond
-    /// with an `UpscaleConfirmation` when done handling the resources (increasing
-    /// file cache size, cgroup memory limits).
-    UpscaleNotification { granted: Resources },
-    /// A request to reduce resource usage. We should respond with a `DownscaleResult`
-    /// when done.
-    DownscaleRequest { target: Resources },
-    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
-    /// agent.
-    /// *Note*: this is a struct variant because of the way go serializes struct{}
-    HealthCheck {},
-}
-
-/// Represents the resources granted to a VM.
-///
-/// The type is `Copy`, so it is passed around by value.
-#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
-// Renamed because the agent has multiple resources types:
-// `Resources` (milliCPU/memory slots)
-// `Allocation` (vCPU/bytes) <- what we correspond to
-#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
-pub struct Resources {
-    /// Number of vCPUs
-    pub(crate) cpu: f64,
-    /// Bytes of memory
-    pub(crate) mem: u64,
-}
-
-impl Resources {
-    /// Build a `Resources` from a vCPU count and a memory size in bytes.
-    pub fn new(cpu: f64, mem: u64) -> Self {
-        Resources { cpu, mem }
-    }
-}
-
-/// Lower bound of the protocol versions named by this crate (currently v1.0).
-pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
-/// Upper bound of the protocol versions named by this crate (currently v1.0).
-pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
-
-/// A version of the agent<->monitor protocol, stored as a raw number.
-///
-/// The derived ordering compares that number, so versions can be compared
-/// directly with `<`, `>`, `min` and `max`.
-#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
-pub struct ProtocolVersion(u8);
-
-impl ProtocolVersion {
-    /// Represents v1.0 of the agent<-> monitor protocol - the initial version
-    ///
-    /// Currently the latest version.
-    const V1_0: ProtocolVersion = ProtocolVersion(1);
-}
-
-impl fmt::Display for ProtocolVersion {
-    /// Writes `v1.0` for the known version, `<invalid: zero>` for zero, and
-    /// `<unknown: N>` (with the raw number) for anything else.
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match *self {
-            ProtocolVersion(0) => f.write_str("<invalid: zero>"),
-            ProtocolVersion::V1_0 => f.write_str("v1.0"),
-            // Destructure and print the raw number. Formatting the
-            // `ProtocolVersion` itself here would call this very function
-            // again and recurse until the stack overflows.
-            ProtocolVersion(other) => write!(f, "<unknown: {other}>"),
-        }
-    }
-}
-
-/// A set of protocol bounds that determines what we are speaking.
-///
-/// These bounds are inclusive.
-#[derive(Debug)]
-pub struct ProtocolRange {
-    /// Lowest supported version (inclusive).
-    pub min: ProtocolVersion,
-    /// Highest supported version (inclusive).
-    pub max: ProtocolVersion,
-}
-
-// Hand-written `Deserialize` so that a range with `min > max` is rejected at
-// deserialization time instead of ever being constructed.
-impl<'de> Deserialize<'de> for ProtocolRange {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        // Same shape as `ProtocolRange`, but with the derived (unchecked) impl.
-        #[derive(Deserialize)]
-        struct InnerProtocolRange {
-            min: ProtocolVersion,
-            max: ProtocolVersion,
-        }
-
-        let unchecked = InnerProtocolRange::deserialize(deserializer)?;
-        let (min, max) = (unchecked.min, unchecked.max);
-        if min <= max {
-            return Ok(ProtocolRange { min, max });
-        }
-
-        let complaint = format!("min version = {min} is greater than max version = {max}");
-        Err(D::Error::custom(complaint))
-    }
-}
-
-impl fmt::Display for ProtocolRange {
-    /// Prints a single version when both bounds coincide, and `<min> to <max>`
-    /// otherwise.
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let Self { min, max } = self;
-        if min != max {
-            write!(f, "{} to {}", min, max)
-        } else {
-            write!(f, "{}", max)
-        }
-    }
-}
-
-impl ProtocolRange {
-    /// Find the highest version contained in both `self` and `other`.
-    ///
-    /// Returns an error naming the offending bounds if the two (inclusive)
-    /// ranges do not overlap.
-    pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
-        // We first have to make sure the ranges are overlapping. Once we know
-        // this, the highest version both sides support is the smaller of the
-        // two maxes.
-        if self.min > other.max {
-            anyhow::bail!(
-                "Non-overlapping bounds: other.max = {} was less than self.min = {}",
-                other.max,
-                self.min,
-            )
-        } else if self.max < other.min {
-            anyhow::bail!(
-                "Non-overlapping bounds: self.max = {} was less than other.min = {}",
-                self.max,
-                other.min
-            )
-        } else {
-            Ok(cmp::min(self.max, other.max))
-        }
-    }
-}
-
-/// We send this to the monitor after negotiating which protocol to use
-// NOTE(review): this crate is itself the monitor and the type only derives
-// `Serialize`, so the recipient is presumably the agent - confirm and reword.
-// Variant names are serialized in camelCase (`error` / `version`).
-#[derive(Serialize, Debug)]
-#[serde(rename_all = "camelCase")]
-pub enum ProtocolResponse {
-    Error(String),
-    Version(ProtocolVersion),
-}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -1,460 +0,0 @@
-//! Exposes the `Runner`, which handles messages received from agent and
-//! sends upscale requests.
-//!
-//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
-//! all functionality.
-
-use std::sync::Arc;
-use std::{fmt::Debug, mem};
-
-use anyhow::{bail, Context};
-use axum::extract::ws::{Message, WebSocket};
-use futures::StreamExt;
-use tokio::sync::broadcast;
-use tokio::sync::mpsc;
-use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
-
-use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
-use crate::dispatcher::Dispatcher;
-use crate::filecache::{FileCacheConfig, FileCacheState};
-use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
-use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
-
-/// Central struct that interacts with agent, dispatcher, and cgroup to handle
-/// signals from the agent.
-#[derive(Debug)]
-pub struct Runner {
-    config: Config,
-    /// File cache handle; `None` when no Postgres connection string was given.
-    filecache: Option<FileCacheState>,
-    /// Cgroup watcher; `None` when no cgroup name was given.
-    cgroup: Option<Arc<CgroupWatcher>>,
-    /// Connection to the agent, used both to receive and to send messages.
-    dispatcher: Dispatcher,
-
-    /// We "mint" new message ids by incrementing this counter and taking the value.
-    ///
-    /// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
-    /// by us vs the autoscaler-agent.
-    counter: usize,
-
-    /// A signal to kill the main thread produced by `self.run()`. This is triggered
-    /// when the server receives a new connection. When the thread receives the
-    /// signal off this channel, it will gracefully shutdown.
-    kill: broadcast::Receiver<()>,
-}
-
-/// Configuration for a `Runner`
-#[derive(Debug)]
-pub struct Config {
-    /// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
-    /// handing out the rest to userspace. This value is the estimated difference between the
-    /// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
-    ///
-    /// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
-    /// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
-    ///
-    /// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
-    /// size, rather than the self-reported memory size, according to the kernel.
-    ///
-    /// Must be non-zero: `Runner::new` rejects a config where this is 0.
-    ///
-    /// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
-    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
-    /// should be removed once we have a better solution there.
-    sys_buffer_bytes: u64,
-}
-
-impl Default for Config {
-    /// The default configuration reserves a 100 MiB system buffer.
-    fn default() -> Self {
-        let sys_buffer_bytes = 100 * MiB;
-        Config { sys_buffer_bytes }
-    }
-}
-
-impl Runner {
-    /// Create a new monitor.
-    ///
-    /// Sets up the dispatcher on `ws`, then - depending on which of
-    /// `args.pgconnstr` and `args.cgroup` are present - initializes the file
-    /// cache and the cgroup watcher. The file cache comes first so that the
-    /// memory it reserves is subtracted before the cgroup limits are set.
-    ///
-    /// Returns an error if `config.sys_buffer_bytes` is 0 or if any of the
-    /// setup steps fails.
-    #[tracing::instrument(skip_all, fields(?config, ?args))]
-    pub async fn new(
-        config: Config,
-        args: &Args,
-        ws: WebSocket,
-        kill: broadcast::Receiver<()>,
-        token: CancellationToken,
-    ) -> anyhow::Result<Runner> {
-        anyhow::ensure!(
-            config.sys_buffer_bytes != 0,
-            "invalid monitor Config: sys_buffer_bytes cannot be 0"
-        );
-
-        // *NOTE*: the dispatcher and cgroup manager talk through these channels
-        // so make sure they each get the correct half, nothing is dropped, etc.
-        let (notified_send, notified_recv) = mpsc::channel(1);
-        let (requesting_send, requesting_recv) = mpsc::channel(1);
-
-        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
-            .await
-            .context("error creating new dispatcher")?;
-
-        let mut state = Runner {
-            config,
-            filecache: None,
-            cgroup: None,
-            dispatcher,
-            counter: 1, // NB: must be odd, see the comment about the field for more.
-            kill,
-        };
-
-        let mut file_cache_reserved_bytes = 0;
-        let mem = get_total_system_memory();
-
-        // We need to process file cache initialization before cgroup initialization, so that the memory
-        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
-        // memory limits.
-        if let Some(connstr) = &args.pgconnstr {
-            info!("initializing file cache");
-            let config: FileCacheConfig = Default::default();
-            if !config.in_memory {
-                // Same wording as the equivalent checks in `try_downscale`
-                // and `handle_upscale`.
-                panic!("file cache not in-memory unimplemented")
-            }
-
-            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
-                .await
-                .context("failed to create file cache")?;
-
-            let size = file_cache
-                .get_file_cache_size()
-                .await
-                .context("error getting file cache size")?;
-
-            let new_size = file_cache.config.calculate_cache_size(mem);
-            info!(
-                initial = bytes_to_mebibytes(size),
-                new = bytes_to_mebibytes(new_size),
-                "setting initial file cache size",
-            );
-
-            // note: even if size == new_size, we want to explicitly set it, just
-            // to make sure that we have the permissions to do so
-            let actual_size = file_cache
-                .set_file_cache_size(new_size)
-                .await
-                .context("failed to set file cache size, possibly due to inadequate permissions")?;
-            if actual_size != new_size {
-                info!("file cache size actually got set to {actual_size}")
-            }
-            file_cache_reserved_bytes = actual_size;
-
-            state.filecache = Some(file_cache);
-        }
-
-        if let Some(name) = &args.cgroup {
-            let (mut cgroup, cgroup_event_stream) =
-                CgroupWatcher::new(name.clone(), requesting_send)
-                    .context("failed to create cgroup manager")?;
-
-            // Whatever the file cache reserved is not available to the cgroup.
-            let available = mem - file_cache_reserved_bytes;
-
-            cgroup
-                .set_memory_limits(available)
-                .context("failed to set cgroup memory limits")?;
-
-            let cgroup = Arc::new(cgroup);
-
-            // Some might call this . . . cgroup v2
-            let cgroup_clone = Arc::clone(&cgroup);
-
-            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
-                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
-            });
-
-            state.cgroup = Some(cgroup);
-        } else {
-            // *NOTE*: We need to forget the sender so that its drop impl does not get run.
-            // This allows us to poll it in `Runner::run` regardless of whether we
-            // are managing a cgroup or not. If we don't forget it, all receives will
-            // immediately return an error because the sender is dropped and it will
-            // claim all select! statements, effectively turning `Runner::run` into
-            // `loop { fail to receive }`.
-            mem::forget(requesting_send);
-        }
-
-        Ok(state)
-    }
-
-    /// Attempt to downscale filecache + cgroup
-    ///
-    /// Returns `(ok, status)`: `ok` says whether the downscale was carried out
-    /// and `status` is a human-readable description of what was done (or of why
-    /// the request was refused). An `Err` is returned only when one of the
-    /// underlying operations fails.
-    #[tracing::instrument(skip_all, fields(?target))]
-    pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
-        // Nothing to adjust
-        if self.cgroup.is_none() && self.filecache.is_none() {
-            info!("no action needed for downscale (no cgroup or file cache enabled)");
-            return Ok((
-                true,
-                "monitor is not managing cgroup or file cache".to_string(),
-            ));
-        }
-
-        let requested_mem = target.mem;
-        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        // What the file cache would be sized to at the requested memory; 0 when
-        // no file cache is managed.
-        let expected_file_cache_mem_usage = self
-            .filecache
-            .as_ref()
-            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
-            .unwrap_or(0);
-        let mut new_cgroup_mem_high = 0;
-        if let Some(cgroup) = &self.cgroup {
-            new_cgroup_mem_high = cgroup
-                .config
-                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
-
-            let current = cgroup
-                .current_memory_usage()
-                .context("failed to fetch cgroup memory")?;
-
-            // Refuse the downscale, before anything has been changed, if the new
-            // memory.high would leave less than the configured buffer above the
-            // cgroup's current usage.
-            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
-                let status = format!(
-                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
-                    "calculated memory.high too low",
-                    bytes_to_mebibytes(new_cgroup_mem_high),
-                    bytes_to_mebibytes(current),
-                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
-                );
-
-                info!(status, "discontinuing downscale");
-
-                return Ok((false, status));
-            }
-        }
-
-        // The downscaling has been approved. Downscale the file cache, then the cgroup.
-        let mut status = vec![];
-        let mut file_cache_mem_usage = 0;
-        if let Some(file_cache) = &mut self.filecache {
-            if !file_cache.config.in_memory {
-                panic!("file cache not in-memory unimplemented")
-            }
-
-            let actual_usage = file_cache
-                .set_file_cache_size(expected_file_cache_mem_usage)
-                .await
-                .context("failed to set file cache size")?;
-            file_cache_mem_usage = actual_usage;
-            let message = format!(
-                "set file cache size to {} MiB",
-                bytes_to_mebibytes(actual_usage)
-            );
-            info!("downscale: {message}");
-            status.push(message);
-        }
-
-        if let Some(cgroup) = &self.cgroup {
-            let available_memory = usable_system_memory - file_cache_mem_usage;
-
-            // The file cache ended up at a different size than planned, so the
-            // memory.high computed earlier no longer matches; recompute it.
-            if file_cache_mem_usage != expected_file_cache_mem_usage {
-                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
-            }
-
-            let limits = MemoryLimits::new(
-                // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be 0 here
-                // since it is properly initialized in the previous cgroup if let block
-                new_cgroup_mem_high,
-                available_memory,
-            );
-            cgroup
-                .set_limits(&limits)
-                .context("failed to set cgroup memory limits")?;
-
-            let message = format!(
-                "set cgroup memory.high to {} MiB, of new max {} MiB",
-                bytes_to_mebibytes(new_cgroup_mem_high),
-                bytes_to_mebibytes(available_memory)
-            );
-            info!("downscale: {message}");
-            status.push(message);
-        }
-
-        // TODO: make this status thing less jank
-        let status = status.join("; ");
-        Ok((true, status))
-    }
-
-    /// Handle new resources
-    ///
-    /// Resizes the file cache (if managed) for the newly granted memory and
-    /// then sets the cgroup's memory limits (if managed) from whatever memory
-    /// is left. Does nothing when neither is managed.
-    ///
-    /// Returns an error if resizing the file cache or setting the cgroup
-    /// limits fails.
-    #[tracing::instrument(skip_all, fields(?resources))]
-    pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
-        if self.filecache.is_none() && self.cgroup.is_none() {
-            info!("no action needed for upscale (no cgroup or file cache enabled)");
-            return Ok(());
-        }
-
-        let new_mem = resources.mem;
-        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
-
-        // Get the file cache's expected contribution to the memory usage
-        let mut file_cache_mem_usage = 0;
-        if let Some(file_cache) = &mut self.filecache {
-            if !file_cache.config.in_memory {
-                panic!("file cache not in-memory unimplemented");
-            }
-
-            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
-            info!(
-                target = bytes_to_mebibytes(expected_usage),
-                total = bytes_to_mebibytes(new_mem),
-                "updating file cache size",
-            );
-
-            let actual_usage = file_cache
-                .set_file_cache_size(expected_usage)
-                .await
-                .context("failed to set file cache size")?;
-
-            if actual_usage != expected_usage {
-                warn!(
-                    "file cache was set to a different size that we wanted: target = {} Mib, actual= {} Mib",
-                    bytes_to_mebibytes(expected_usage),
-                    bytes_to_mebibytes(actual_usage)
-                )
-            }
-            // Budget the cgroup against what the cache actually uses, not
-            // against what we asked for.
-            file_cache_mem_usage = actual_usage;
-        }
-
-        if let Some(cgroup) = &self.cgroup {
-            let available_memory = usable_system_memory - file_cache_mem_usage;
-            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
-            info!(
-                target = bytes_to_mebibytes(new_cgroup_mem_high),
-                total = bytes_to_mebibytes(new_mem),
-                name = cgroup.path(),
-                "updating cgroup memory.high",
-            );
-            let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
-            // This step touches the cgroup, so a failure must be reported as a
-            // cgroup failure (same wording as in `try_downscale`).
-            cgroup
-                .set_limits(&limits)
-                .context("failed to set cgroup memory limits")?;
-        }
-
-        Ok(())
-    }
-
-    /// Take in a message and perform some action, such as downscaling or upscaling,
-    /// and return a message to be sent back.
-    ///
-    /// Returns `Ok(None)` for messages that need no reply (notifications that
-    /// the agent hit an error or could not parse something we sent). Any reply
-    /// carries the `id` of the message it answers. An `Err` means handling the
-    /// message failed.
-    #[tracing::instrument(skip_all, fields(%id, message = ?inner))]
-    pub async fn process_message(
-        &mut self,
-        InboundMsg { inner, id }: InboundMsg,
-    ) -> anyhow::Result<Option<OutboundMsg>> {
-        match inner {
-            InboundMsgKind::UpscaleNotification { granted } => {
-                self.handle_upscale(granted)
-                    .await
-                    .context("failed to handle upscale")?;
-                self.dispatcher
-                    .notify_upscale(Sequenced::new(granted))
-                    .await
-                    .context("failed to notify cgroup of upscale")?;
-                Ok(Some(OutboundMsg::new(
-                    OutboundMsgKind::UpscaleConfirmation {},
-                    id,
-                )))
-            }
-            InboundMsgKind::DownscaleRequest { target } => self
-                .try_downscale(target)
-                .await
-                .context("failed to downscale")
-                .map(|(ok, status)| {
-                    Some(OutboundMsg::new(
-                        OutboundMsgKind::DownscaleResult { ok, status },
-                        id,
-                    ))
-                }),
-            InboundMsgKind::InvalidMessage { error } => {
-                warn!(
-                    %error, id, "received notification of an invalid message we sent"
-                );
-                Ok(None)
-            }
-            InboundMsgKind::InternalError { error } => {
-                warn!(error, id, "agent experienced an internal error");
-                Ok(None)
-            }
-            // Heartbeats are simply echoed back under the same id.
-            InboundMsgKind::HealthCheck {} => {
-                Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id)))
-            }
-        }
-    }
-
-    // TODO: don't propagate errors, probably just warn!?
-    /// Main loop of the monitor.
-    ///
-    /// Repeatedly waits for whichever comes first: a kill signal (returns
-    /// `Ok(())`), an upscale request from the cgroup watcher (forwarded to the
-    /// agent), or a message from the agent (processed and answered). Returns an
-    /// error if one of the channels or the connection closes, if a text message
-    /// cannot be deserialized, or if sending to the agent fails.
-    #[tracing::instrument(skip_all)]
-    pub async fn run(&mut self) -> anyhow::Result<()> {
-        info!("starting dispatcher");
-        loop {
-            tokio::select! {
-                signal = self.kill.recv() => {
-                    if let Err(e) = signal {
-                        bail!("failed to receive kill signal: {e}")
-                    }
-                    return Ok(());
-                }
-                // we need to propagate an upscale request
-                request = self.dispatcher.request_upscale_events.recv() => {
-                    match request {
-                        None => bail!("failed to listen for upscale event from cgroup"),
-                        Some(_) => info!("cgroup asking for upscale; forwarding request"),
-                    }
-                    // Step by two so the counter stays odd; see the comment on
-                    // the `counter` field for why.
-                    self.counter += 2;
-                    let upscale_request =
-                        OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter);
-                    self.dispatcher
-                        .send(upscale_request)
-                        .await
-                        .context("failed to send message")?;
-                }
-                // there is a message from the agent
-                msg = self.dispatcher.source.next() => {
-                    let msg = match msg {
-                        Some(msg) => msg,
-                        None => anyhow::bail!("dispatcher connection closed"),
-                    };
-                    // Don't use 'message' as a key as the string also uses
-                    // that for its key
-                    info!(?msg, "received message");
-
-                    let msg = match msg {
-                        Ok(msg) => msg,
-                        Err(e) => {
-                            warn!("{e}");
-                            continue;
-                        }
-                    };
-
-                    let message: InboundMsg = match msg {
-                        Message::Text(text) => serde_json::from_str(&text)
-                            .context("failed to deserialize text message")?,
-                        other => {
-                            warn!(
-                                // Don't use 'message' as a key as the
-                                // string also uses that for its key
-                                msg = ?other,
-                                "agent should only send text messages but received different type"
-                            );
-                            continue;
-                        }
-                    };
-
-                    // Remember the id up front: it is needed for the error
-                    // reply after the message has been handed over.
-                    let id = message.id;
-                    let out = match self.process_message(message).await {
-                        Ok(Some(out)) => out,
-                        Ok(None) => continue,
-                        Err(e) => {
-                            let error = e.to_string();
-                            warn!(?error, "error handling message");
-                            OutboundMsg::new(OutboundMsgKind::InternalError { error }, id)
-                        }
-                    };
-
-                    self.dispatcher
-                        .send(out)
-                        .await
-                        .context("failed to send message")?;
-                }
-            }
-        }
-    }
-}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -51,7 +51,6 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
-smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
 tokio-tar.workspace = true
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -10,7 +10,7 @@ use std::{fs, path::Path, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
-use pageserver::tenant::block_io::FileBlockReader;
+use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -44,6 +44,8 @@ pub(crate) enum LayerCmd {
 }

 async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    use pageserver::tenant::block_io::BlockReader;
+
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
@@ -68,7 +70,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
            },
        )
        .await?;
-    let cursor = BlockCursor::new_fileblockreader_virtual(&file);
+    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -203,11 +203,6 @@ impl Slot {
            Err(usage_count) => usage_count,
        }
    }
-
-    /// Sets the usage count to a specific value.
-    fn set_usage_count(&self, count: u8) {
-        self.usage_count.store(count, Ordering::Relaxed);
-    }
 }

 pub struct PageCache {
@@ -268,7 +263,6 @@ pub struct PageWriteGuard<'i> {
    inner: RwLockWriteGuard<'i, SlotInner>,

    // Are the page contents currently valid?
-    // Used to mark pages as invalid that are assigned but not yet filled with data.
    valid: bool,
 }

@@ -431,6 +425,27 @@ impl PageCache {
        self.lock_for_read(&mut cache_key)
    }

+    /// Immediately drop all cached buffers belonging to the given immutable file
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
+        for slot_idx in 0..self.slots.len() {
+            let slot = &self.slots[slot_idx];
+
+            let mut inner = slot.inner.write().unwrap();
+            if let Some(key) = &inner.key {
+                match key {
+                    CacheKey::ImmutableFilePage { file_id, blkno: _ }
+                        if *file_id == drop_file_id =>
+                    {
+                        // remove mapping for old buffer
+                        self.remove_mapping(key);
+                        inner.key = None;
+                    }
+                    _ => {}
+                }
+            }
+        }
+    }
+
    //
    // Section 2: Internal interface functions for lookup/update.
    //
@@ -541,7 +556,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
+            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
                inner,
@@ -602,7 +617,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
+            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
                inner,
@@ -801,8 +816,6 @@ impl PageCache {
    fn new(num_pages: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

-        // We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
-        // memory that Vec's might contain.
        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -422,53 +422,13 @@ impl Tenant {
            init_order,
            CreateTimelineCause::Load,
        )?;
-        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
+        let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
-            disk_consistent_lsn.is_valid(),
+            new_disk_consistent_lsn.is_valid(),
            "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
        );
-        assert_eq!(
-            disk_consistent_lsn,
-            up_to_date_metadata.disk_consistent_lsn(),
-            "these are used interchangeably"
-        );
-
-        // Save the metadata file to local disk.
-        if !picked_local {
-            save_metadata(
-                self.conf,
-                &tenant_id,
-                &timeline_id,
-                up_to_date_metadata,
-                first_save,
-            )
-            .context("save_metadata")?;
-        }
-
-        let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
-
-        if let Some(index_part) = index_part {
-            timeline
-                .remote_client
-                .as_ref()
-                .unwrap()
-                .init_upload_queue(index_part)?;
-        } else if self.remote_storage.is_some() {
-            // No data on the remote storage, but we have local metadata file. We can end up
-            // here with timeline_create being interrupted before finishing index part upload.
-            // By doing what we do here, the index part upload is retried.
-            // If control plane retries timeline creation in the meantime, the mgmt API handler
-            // for timeline creation will coalesce on the upload we queue here.
-            let rtc = timeline.remote_client.as_ref().unwrap();
-            rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
-            rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
-        }
-
        timeline
-            .load_layer_map(
-                disk_consistent_lsn,
-                remote_startup_data.map(|x| x.index_part),
-            )
+            .load_layer_map(new_disk_consistent_lsn)
            .await
            .with_context(|| {
                format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
@@ -492,6 +452,19 @@ impl Tenant {
            }
        };

+        if self.remote_storage.is_some() {
+            // Reconcile local state with remote storage, downloading anything that's
+            // missing locally, and scheduling uploads for anything that's missing
+            // in remote storage.
+            timeline
+                .reconcile_with_remote(
+                    up_to_date_metadata,
+                    remote_startup_data.as_ref().map(|r| &r.index_part),
+                )
+                .await
+                .context("failed to reconcile with remote")?
+        }
+
        // Sanity check: a timeline should have some content.
        anyhow::ensure!(
            ancestor.is_some()
@@ -506,6 +479,18 @@ impl Tenant {
            "Timeline has no ancestor and no layer files"
        );

+        // Save the metadata file to local disk.
+        if !picked_local {
+            save_metadata(
+                self.conf,
+                &tenant_id,
+                &timeline_id,
+                up_to_date_metadata,
+                first_save,
+            )
+            .context("save_metadata")?;
+        }
+
        Ok(())
    }

@@ -698,7 +683,10 @@ impl Tenant {
            debug!("successfully downloaded index part for timeline {timeline_id}");
            match index_part {
                MaybeDeletedIndexPart::IndexPart(index_part) => {
-                    timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
+                    timeline_ancestors.insert(
+                        timeline_id,
+                        index_part.parse_metadata().context("parse_metadata")?,
+                    );
                    remote_index_and_client.insert(timeline_id, (index_part, client));
                }
                MaybeDeletedIndexPart::Deleted(index_part) => {
@@ -749,7 +737,7 @@ impl Tenant {
            DeleteTimelineFlow::resume_deletion(
                Arc::clone(self),
                timeline_id,
-                &index_part.metadata,
+                &index_part.parse_metadata().context("parse_metadata")?,
                Some(remote_timeline_client),
                None,
            )
@@ -1311,7 +1299,10 @@ impl Tenant {
                        }
                    };

-                    let remote_metadata = index_part.metadata.clone();
+                    let remote_metadata = index_part
+                        .parse_metadata()
+                        .context("parse_metadata")
+                        .map_err(LoadLocalTimelineError::Load)?;
                    (
                        Some(RemoteStartupData {
                            index_part,
@@ -4101,7 +4092,7 @@ mod tests {
        let mut found_error_message = false;
        let mut err_source = err.source();
        while let Some(source) = err_source {
-            if source.to_string().contains("metadata checksum mismatch") {
+            if source.to_string() == "metadata checksum mismatch" {
                found_error_message = true;
                break;
            }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -12,11 +12,14 @@
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use crate::page_cache::PAGE_SZ;
-use crate::tenant::block_io::BlockCursor;
+use crate::tenant::block_io::{BlockCursor, BlockReader};
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-impl<'a> BlockCursor<'a> {
+impl<R> BlockCursor<R>
+where
+    R: BlockReader,
+{
    /// Read a blob into a new buffer.
    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,12 +2,8 @@
 //! Low-level Block-oriented I/O functions
 //!

-use super::ephemeral_file::EphemeralFile;
-use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
-use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
-use std::fs::File;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;

@@ -17,20 +13,32 @@ use std::os::unix::fs::FileExt;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
+    ///
+    /// Read a block. Returns a "lease" object that can be used to
+    /// access the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    ///
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
+
    ///
    /// Create a new "cursor" for reading from this reader.
    ///
    /// A cursor caches the last accessed page, allowing for faster
    /// access if the same block is accessed repeatedly.
-    fn block_cursor(&self) -> BlockCursor<'_>;
+    fn block_cursor(&self) -> BlockCursor<&Self>
+    where
+        Self: Sized,
+    {
+        BlockCursor::new(self)
+    }
 }

 impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    fn block_cursor(&self) -> BlockCursor<'_> {
-        (*self).block_cursor()
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        (*self).read_blk(blknum)
    }
 }

@@ -68,34 +76,6 @@ impl<'a> Deref for BlockLease<'a> {
    }
 }

-/// Provides the ability to read blocks from different sources,
-/// similar to using traits for this purpose.
-///
-/// Unlike traits, we also support the read function to be async though.
-pub(crate) enum BlockReaderRef<'a> {
-    FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
-    FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
-    EphemeralFile(&'a EphemeralFile),
-    Adapter(Adapter<&'a DeltaLayerInner>),
-    #[cfg(test)]
-    TestDisk(&'a super::disk_btree::tests::TestDisk),
-}
-
-impl<'a> BlockReaderRef<'a> {
-    #[inline(always)]
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        use BlockReaderRef::*;
-        match self {
-            FileBlockReaderVirtual(r) => r.read_blk(blknum),
-            FileBlockReaderFile(r) => r.read_blk(blknum),
-            EphemeralFile(r) => r.read_blk(blknum),
-            Adapter(r) => r.read_blk(blknum),
-            #[cfg(test)]
-            TestDisk(r) => r.read_blk(blknum),
-        }
-    }
-}
-
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -113,27 +93,21 @@ impl<'a> BlockReaderRef<'a> {
 /// // do stuff with 'buf'
 /// ```
 ///
-pub struct BlockCursor<'a> {
-    reader: BlockReaderRef<'a>,
+pub struct BlockCursor<R>
+where
+    R: BlockReader,
+{
+    reader: R,
 }

-impl<'a> BlockCursor<'a> {
-    pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
+impl<R> BlockCursor<R>
+where
+    R: BlockReader,
+{
+    pub fn new(reader: R) -> Self {
        BlockCursor { reader }
    }
-    // Needed by cli
-    pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
-        BlockCursor {
-            reader: BlockReaderRef::FileBlockReaderVirtual(reader),
-        }
-    }

-    /// Read a block.
-    ///
-    /// Returns a "lease" object that can be used to
-    /// access to the contents of the page. (For the page cache, the
-    /// lease object represents a lock on the buffer.)
-    #[inline(always)]
    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
@@ -165,12 +139,13 @@ where
        assert!(buf.len() == PAGE_SZ);
        self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
    }
-    /// Read a block.
-    ///
-    /// Returns a "lease" object that can be used to
-    /// access to the contents of the page. (For the page cache, the
-    /// lease object represents a lock on the buffer.)
-    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+}
+
+impl<F> BlockReader for FileBlockReader<F>
+where
+    F: FileExt,
+{
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
@@ -195,18 +170,6 @@ where
    }
 }

-impl BlockReader for FileBlockReader<File> {
-    fn block_cursor(&self) -> BlockCursor<'_> {
-        BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
-    }
-}
-
-impl BlockReader for FileBlockReader<VirtualFile> {
-    fn block_cursor(&self) -> BlockCursor<'_> {
-        BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
-    }
-}
-
 ///
 /// Trait for block-oriented output
 ///
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -7,7 +7,6 @@ use anyhow::Context;
 use pageserver_api::models::TenantState;
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
-use tokio_util::sync::CancellationToken;
 use tracing::{error, info, instrument, warn, Instrument, Span};

 use utils::{
@@ -83,8 +82,6 @@ async fn create_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await
    .context("mark_upload")?;
@@ -174,8 +171,6 @@ async fn remove_tenant_remote_delete_mark(
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
-            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await
        .context("remove_tenant_remote_delete_mark")?;
@@ -257,8 +252,6 @@ pub(crate) async fn remote_delete_mark_exists(
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        "fetch_tenant_deletion_mark",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await;

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -259,10 +259,9 @@ where
    {
        let mut stack = Vec::new();
        stack.push((self.root_blk, None));
-        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, opt_iter)) = stack.pop() {
            // Locate the node.
-            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum)?;
+            let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
@@ -354,10 +353,8 @@ where

        stack.push((self.root_blk, String::new(), 0, 0, 0));

-        let block_cursor = self.reader.block_cursor();
-
        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = block_cursor.read_blk(self.start_blk + blknum)?;
+            let blk = self.reader.read_blk(self.start_blk + blknum)?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -686,32 +683,29 @@ impl<const L: usize> BuildNode<L> {
 }

 #[cfg(test)]
-pub(crate) mod tests {
+mod tests {
    use super::*;
-    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
+    use crate::tenant::block_io::BlockLease;
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[derive(Clone, Default)]
-    pub(crate) struct TestDisk {
+    struct TestDisk {
        blocks: Vec<Bytes>,
    }
    impl TestDisk {
        fn new() -> Self {
            Self::default()
        }
-        pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
+    }
+    impl BlockReader for TestDisk {
+        fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
            Ok(std::rc::Rc::new(buf).into())
        }
    }
-    impl BlockReader for TestDisk {
-        fn block_cursor(&self) -> BlockCursor<'_> {
-            BlockCursor::new(BlockReaderRef::TestDisk(self))
-        }
-    }
    impl BlockWriter for &mut TestDisk {
        fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
            let blknum = self.blocks.len();
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -3,7 +3,8 @@

 use crate::config::PageServerConf;
 use crate::page_cache::{self, PAGE_SZ};
-use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
+use crate::tenant::blob_io::BlobWriter;
+use crate::tenant::block_io::{BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::fs::OpenOptions;
@@ -21,7 +22,7 @@ pub struct EphemeralFile {
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
    file: VirtualFile,
-    len: u64,
+    size: u64,
    /// An ephemeral file is append-only.
    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
    /// The other pages, which can no longer be modified, are accessed through the page cache.
@@ -52,56 +53,27 @@ impl EphemeralFile {
            _tenant_id: tenant_id,
            _timeline_id: timeline_id,
            file,
-            len: 0,
+            size: 0,
            mutable_tail: [0u8; PAGE_SZ],
        })
    }

-    pub(crate) fn len(&self) -> u64 {
-        self.len
+    pub(crate) fn size(&self) -> u64 {
+        self.size
    }
+}

-    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
-        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
-        if flushed_blknums.contains(&(blknum as u64)) {
-            let cache = page_cache::get();
-            loop {
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum)
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.file.path.display(),
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        let buf: &mut [u8] = write_guard.deref_mut();
-                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                        self.file
-                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
-                        write_guard.mark_valid();
-
-                        // Swap for read lock
-                        continue;
-                    }
-                };
-            }
-        } else {
-            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
-            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
-        }
+/// Does the given filename look like an ephemeral file?
+pub fn is_ephemeral_file(filename: &str) -> bool {
+    if let Some(rest) = filename.strip_prefix("ephemeral-") {
+        rest.parse::<u32>().is_ok()
+    } else {
+        false
    }
+}

-    pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+impl BlobWriter for EphemeralFile {
+    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
        struct Writer<'a> {
            ephemeral_file: &'a mut EphemeralFile,
            /// The block to which the next [`push_bytes`] will write.
@@ -112,13 +84,13 @@ impl EphemeralFile {
        impl<'a> Writer<'a> {
            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
                Ok(Writer {
-                    blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
-                    off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
+                    blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
+                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
                    ephemeral_file,
                })
            }
            #[inline(always)]
-            async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -182,47 +154,39 @@ impl EphemeralFile {
            }
        }

-        let pos = self.len;
+        let pos = self.size;
        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
-            writer.push_bytes(&len_buf).await?;
+            writer.push_bytes(&len_buf)?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            writer.push_bytes(&len_buf).await?;
+            writer.push_bytes(&len_buf)?;
        }

        // Write the payload
-        writer.push_bytes(srcbuf).await?;
+        writer.push_bytes(srcbuf)?;

        if srcbuf.len() < 0x80 {
-            self.len += 1;
+            self.size += 1;
        } else {
-            self.len += 4;
+            self.size += 4;
        }
-        self.len += srcbuf.len() as u64;
+        self.size += srcbuf.len() as u64;

        Ok(pos)
    }
 }

-/// Does the given filename look like an ephemeral file?
-pub fn is_ephemeral_file(filename: &str) -> bool {
-    if let Some(rest) = filename.strip_prefix("ephemeral-") {
-        rest.parse::<u32>().is_ok()
-    } else {
-        false
-    }
-}
-
 impl Drop for EphemeralFile {
    fn drop(&mut self) {
-        // There might still be pages in the [`crate::page_cache`] for this file.
-        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
+        // drop all of this file's pages from the page cache
+        let cache = page_cache::get();
+        cache.drop_buffers_for_immutable(self.page_cache_file_id);

        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
@@ -243,15 +207,52 @@ impl Drop for EphemeralFile {
 }

 impl BlockReader for EphemeralFile {
-    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
-        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+        let flushed_blknums = 0..self.size / PAGE_SZ as u64;
+        if flushed_blknums.contains(&(blknum as u64)) {
+            let cache = page_cache::get();
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum)
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum,
+                                self.file.path.display(),
+                                e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
+                        write_guard.mark_valid();
+
+                        // Swap for read lock
+                        continue;
+                    }
+                };
+            }
+        } else {
+            debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
+            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
+        }
    }
 }

 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
+    use crate::tenant::blob_io::BlobWriter;
+    use crate::tenant::block_io::BlockCursor;
    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;
@@ -279,12 +280,12 @@ mod tests {

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

-        let pos_foo = file.write_blob(b"foo").await?;
+        let pos_foo = file.write_blob(b"foo")?;
        assert_eq!(
            b"foo",
            file.block_cursor().read_blob(pos_foo).await?.as_slice()
        );
-        let pos_bar = file.write_blob(b"bar").await?;
+        let pos_bar = file.write_blob(b"bar")?;
        assert_eq!(
            b"foo",
            file.block_cursor().read_blob(pos_foo).await?.as_slice()
@@ -297,17 +298,17 @@ mod tests {
        let mut blobs = Vec::new();
        for i in 0..10000 {
            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data).await?;
+            let pos = file.write_blob(&data)?;
            blobs.push((pos, data));
        }
        // also test with a large blobs
        for i in 0..100 {
            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data).await?;
+            let pos = file.write_blob(&data)?;
            blobs.push((pos, data));
        }

-        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
+        let cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
            let actual = cursor.read_blob(pos).await?;
            assert_eq!(actual, expected);
@@ -317,7 +318,7 @@ mod tests {
        let mut large_data = Vec::new();
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data).await?;
+        let pos_large = file.write_blob(&large_data)?;
        let result = file.block_cursor().read_blob(pos_large).await?;
        assert_eq!(result, large_data);

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
 use std::io::{self, Write};

 use anyhow::{bail, ensure, Context};
-use serde::{de::Error, Deserialize, Serialize, Serializer};
+use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
@@ -232,28 +232,6 @@ impl TimelineMetadata {
    }
 }

-impl<'de> Deserialize<'de> for TimelineMetadata {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let bytes = Vec::<u8>::deserialize(deserializer)?;
-        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
-    }
-}
-
-impl Serialize for TimelineMetadata {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        let bytes = self
-            .to_bytes()
-            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
-        bytes.serialize(serializer)
-    }
-}
-
 /// Save timeline metadata to file
 pub fn save_metadata(
    conf: &'static PageServerConf,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -135,7 +135,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`].
+//!   [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -172,6 +172,7 @@
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
+//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
 //! We keep track of the fact that a client is in `Attaching` state in a marker
 //! file on the local disk. This is critical because, when we restart the pageserver,
 //! we do not want to do the `List timelines` step for each tenant that has already
@@ -191,14 +192,14 @@
 //! not created and the uploads are skipped.
 //! Theoretically, it should be ok to remove and re-add remote storage configuration to
 //! the pageserver config at any time, since it doesn't make a difference to
-//! [`Timeline::load_layer_map`].
+//! [`Timeline::reconcile_with_remote`].
 //! Of course, the remote timeline dir must not change while we have de-configured
 //! remote storage, i.e., the pageserver must remain the owner of the given prefix
 //! in remote storage.
 //! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
-//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
+//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote

 mod delete;
 mod download;
@@ -210,7 +211,6 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
-use tokio_util::sync::CancellationToken;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -231,7 +231,6 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
-use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::upload_queue::Delete;
@@ -354,10 +353,6 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part)?;
        self.update_remote_physical_size_gauge(Some(index_part));
-        info!(
-            "initialized upload queue from remote index with {} layer files",
-            index_part.layer_metadata.len()
-        );
        Ok(())
    }

@@ -370,7 +365,6 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_empty_remote(local_metadata)?;
        self.update_remote_physical_size_gauge(None);
-        info!("initialized upload queue as empty");
        Ok(())
    }

@@ -541,7 +535,8 @@ impl RemoteTimelineClient {
        // ahead of what's _actually_ on the remote during index upload.
        upload_queue.latest_metadata = metadata.clone();

-        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        self.schedule_index_upload(upload_queue, metadata_bytes);

        Ok(())
    }
@@ -561,7 +556,8 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+            self.schedule_index_upload(upload_queue, metadata_bytes);
        }

        Ok(())
@@ -571,7 +567,7 @@ impl RemoteTimelineClient {
    fn schedule_index_upload(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        metadata: TimelineMetadata,
+        metadata_bytes: Vec<u8>,
    ) {
        info!(
            "scheduling metadata upload with {} files ({} changed)",
@@ -584,7 +580,7 @@ impl RemoteTimelineClient {
        let index_part = IndexPart::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata,
+            metadata_bytes,
        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
        self.calls_unfinished_metric_begin(&op);
@@ -640,7 +636,7 @@ impl RemoteTimelineClient {

        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
-        let metadata = upload_queue.latest_metadata.clone();
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;

        // Update the remote index file, removing the to-be-deleted files from the index,
        // before deleting the actual files.
@@ -651,13 +647,15 @@ impl RemoteTimelineClient {
        // to syntactically forbid ? or bail! calls here.
        let no_bail_here = || {
            for name in names {
+                // Count a change only when the layer was actually tracked, so that
+                // unknown names do not trigger a redundant index upload below.
                if upload_queue.latest_files.remove(name).is_some() {
                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
                }
            }

            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata);
+                self.schedule_index_upload(upload_queue, metadata_bytes);
            }

            // schedule the actual deletions
@@ -759,7 +754,7 @@ impl RemoteTimelineClient {
        pausable_failpoint!("persist_deleted_index_part");

        backoff::retry(
-            || {
+            || async {
                upload::upload_index_part(
                    self.conf,
                    &self.storage_impl,
@@ -767,6 +762,7 @@ impl RemoteTimelineClient {
                    &self.timeline_id,
                    &index_part_with_deleted_at,
                )
+                .await
            },
            |_e| false,
            1,
@@ -775,8 +771,6 @@ impl RemoteTimelineClient {
            // when executed as part of tenant deletion this happens in the background
            2,
            "persist_index_part_with_deleted_flag",
-            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await?;

@@ -863,7 +857,6 @@ impl RemoteTimelineClient {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "list_prefixes",
-            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
        )
        .await
        .context("list prefixes")?;
@@ -887,7 +880,6 @@ impl RemoteTimelineClient {
                FAILED_UPLOAD_WARN_THRESHOLD,
                FAILED_REMOTE_OP_RETRIES,
                "delete_objects",
-                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
            )
            .await
            .context("delete_objects")?;
@@ -909,7 +901,6 @@ impl RemoteTimelineClient {
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "delete_index",
-            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
        )
        .await
        .context("delete_index")?;
@@ -1075,15 +1066,6 @@ impl RemoteTimelineClient {
                    .await
                }
                UploadOp::UploadMetadata(ref index_part, _lsn) => {
-                    let mention_having_future_layers = if cfg!(feature = "testing") {
-                        index_part
-                            .layer_metadata
-                            .keys()
-                            .any(|x| x.is_in_future(*_lsn))
-                    } else {
-                        false
-                    };
-
                    let res = upload::upload_index_part(
                        self.conf,
                        &self.storage_impl,
@@ -1101,10 +1083,6 @@ impl RemoteTimelineClient {
                    .await;
                    if res.is_ok() {
                        self.update_remote_physical_size_gauge(Some(index_part));
-                        if mention_having_future_layers {
-                            // find rationale near crate::tenant::timeline::init::cleanup_future_layer
-                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
-                        }
                    }
                    res
                }
@@ -1156,13 +1134,14 @@ impl RemoteTimelineClient {
                    }

                    // sleep until it's time to retry, or we're cancelled
-                    exponential_backoff(
-                        retries,
-                        DEFAULT_BASE_BACKOFF_SECONDS,
-                        DEFAULT_MAX_BACKOFF_SECONDS,
-                        &shutdown_token(),
-                    )
-                    .await;
+                    tokio::select! {
+                        _ = task_mgr::shutdown_watcher() => { },
+                        _ = exponential_backoff(
+                            retries,
+                            DEFAULT_BASE_BACKOFF_SECONDS,
+                            DEFAULT_MAX_BACKOFF_SECONDS,
+                        ) => { },
+                    };
                }
            }
        }
@@ -1609,7 +1588,8 @@ mod tests {
                &layer_file_name_2.file_name(),
            ],
        );
-        assert_eq!(index_part.metadata, metadata);
+        let downloaded_metadata = index_part.parse_metadata().unwrap();
+        assert_eq!(downloaded_metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,7 +11,6 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tokio_util::sync::CancellationToken;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
@@ -281,10 +280,6 @@ where
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
-            unreachable!()
-        }),
    )
    .await
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -77,9 +77,7 @@ pub struct IndexPart {
    // private because internally we would read from metadata instead.
    #[serde_as(as = "DisplayFromStr")]
    disk_consistent_lsn: Lsn,
-
-    #[serde(rename = "metadata_bytes")]
-    pub metadata: TimelineMetadata,
+    metadata_bytes: Vec<u8>,
 }

 impl IndexPart {
@@ -97,7 +95,7 @@ impl IndexPart {
    pub fn new(
        layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
        disk_consistent_lsn: Lsn,
-        metadata: TimelineMetadata,
+        metadata_bytes: Vec<u8>,
    ) -> Self {
        let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
        let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
@@ -113,10 +111,14 @@ impl IndexPart {
            timeline_layers,
            layer_metadata,
            disk_consistent_lsn,
-            metadata,
+            metadata_bytes,
            deleted_at: None,
        }
    }
+
+    pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
+        TimelineMetadata::from_bytes(&self.metadata_bytes)
+    }
 }

 impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -124,12 +126,12 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {

    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-        let metadata = upload_queue.latest_metadata.clone();
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;

        Ok(Self::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata,
+            metadata_bytes,
        ))
    }
 }
@@ -180,7 +182,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: None,
        };

@@ -199,7 +201,7 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        }"#;

        let expected = IndexPart {
@@ -217,7 +219,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: None,
        };

@@ -236,7 +238,7 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            "deleted_at": "2023-07-31T09:00:00.123"
        }"#;

@@ -255,7 +257,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
        };
@@ -279,7 +281,7 @@ mod tests {
            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::new(),
            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[
+            metadata_bytes: [
                136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
                38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
                210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
@@ -300,8 +302,8 @@ mod tests {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0,
-            ])
-            .unwrap(),
+            ]
+            .to_vec(),
            deleted_at: None,
        };

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -41,6 +41,8 @@ pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use remote_layer::RemoteLayer;

+use super::timeline::layer_manager::LayerManager;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
    T: PartialOrd<T>,
@@ -173,9 +175,16 @@ impl LayerAccessStats {
    ///
    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
+    pub(crate) fn for_loading_layer(
+        layer_map_lock_held_witness: &LayerManager,
+        status: LayerResidenceStatus,
+    ) -> Self {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
-        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
+        new.record_residence_event(
+            layer_map_lock_held_witness,
+            status,
+            LayerResidenceEventReason::LayerLoad,
+        );
        new
    }

@@ -188,6 +197,7 @@ impl LayerAccessStats {
    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn clone_for_residence_change(
        &self,
+        layer_map_lock_held_witness: &LayerManager,
        new_status: LayerResidenceStatus,
    ) -> LayerAccessStats {
        let clone = {
@@ -195,7 +205,11 @@ impl LayerAccessStats {
            inner.clone()
        };
        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
+        new.record_residence_event(
+            layer_map_lock_held_witness,
+            new_status,
+            LayerResidenceEventReason::ResidenceChange,
+        );
        new
    }

@@ -215,6 +229,7 @@ impl LayerAccessStats {
    ///
    pub(crate) fn record_residence_event(
        &self,
+        _layer_map_lock_held_witness: &LayerManager,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
    ) {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -318,28 +318,30 @@ impl DeltaLayer {

        tree_reader.dump().await?;

-        let keys = DeltaLayerInner::load_keys(&inner).await?;
+        let keys = DeltaLayerInner::load_keys(&Ref(&**inner)).await?;

        // A subroutine to dump a single blob
-        async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
-        }
+        let dump_blob = |val: ValueRef<_>| -> _ {
+            async move {
+                let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
+                let val = Value::des(&buf)?;
+                let desc = match val {
+                    Value::Image(img) => {
+                        format!(" img {} bytes", img.len())
+                    }
+                    Value::WalRecord(rec) => {
+                        let wal_desc = walrecord::describe_wal_record(&rec)?;
+                        format!(
+                            " rec {} bytes will_init: {} {}",
+                            buf.len(),
+                            rec.will_init(),
+                            wal_desc
+                        )
+                    }
+                };
+                Ok(desc)
+            }
+        };

        for entry in keys {
            let DeltaEntry { key, lsn, val, .. } = entry;
@@ -550,12 +552,17 @@ impl DeltaLayer {
    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
    ///
    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
+    pub(crate) async fn load_keys(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
        let inner = self
            .load(LayerAccessKind::KeyIter, ctx)
            .await
            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
+
+        let inner = Ref(&**inner);
+        DeltaLayerInner::load_keys(&inner)
            .await
            .context("Layer index is corrupted")
    }
@@ -951,14 +958,14 @@ impl DeltaLayerInner {

    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
        this: &T,
-    ) -> Result<Vec<DeltaEntry<'_>>> {
+    ) -> Result<Vec<DeltaEntry<T>>> {
        let dl = this.as_ref();
        let file = &dl.file;

        let tree_reader =
            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);

-        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
+        let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();

        tree_reader
            .visit(
@@ -968,9 +975,7 @@ impl DeltaLayerInner {
                    let delta_key = DeltaKey::from_slice(key);
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
-                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(dl),
-                        )),
+                        reader: BlockCursor::new(Adapter(this.clone())),
                    };
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
@@ -999,23 +1004,43 @@ impl DeltaLayerInner {
    }
 }

+/// Cloneable borrow wrapper to make borrows behave like smart pointers.
+///
+/// Shared references are trivially copyable. This wrapper makes that explicit, avoiding the
+/// confusion of code that otherwise looks like it clones `DeltaLayerInner` itself.
+pub(crate) struct Ref<T>(T);
+
+impl<'a, T> AsRef<T> for Ref<&'a T> {
+    fn as_ref(&self) -> &T {
+        self.0
+    }
+}
+
+impl<'a, T> Clone for Ref<&'a T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<'a, T> Copy for Ref<&'a T> {}
+
 /// A set of data associated with a delta layer key and its value
-pub struct DeltaEntry<'a> {
+pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
    pub key: Key,
    pub lsn: Lsn,
    /// Size of the stored value
    pub size: u64,
    /// Reference to the on-disk value
-    pub val: ValueRef<'a>,
+    pub val: ValueRef<T>,
 }

 /// Reference to an on-disk value
-pub struct ValueRef<'a> {
+pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
    blob_ref: BlobRef,
-    reader: BlockCursor<'a>,
+    reader: BlockCursor<Adapter<T>>,
 }

-impl<'a> ValueRef<'a> {
+impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
    /// Loads the value from disk
    pub async fn load(&self) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
@@ -1025,10 +1050,10 @@ impl<'a> ValueRef<'a> {
    }
 }

-pub(crate) struct Adapter<T>(T);
+struct Adapter<T: AsRef<DeltaLayerInner>>(T);

-impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
-    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.0.as_ref().file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -212,20 +212,9 @@ pub enum LayerFileName {
 }

 impl LayerFileName {
-    pub(crate) fn file_name(&self) -> String {
+    pub fn file_name(&self) -> String {
        self.to_string()
    }
-
-    /// Determines if this layer file is considered to be in future meaning we will discard these
-    /// layers during timeline initialization from the given disk_consistent_lsn.
-    pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
-        use LayerFileName::*;
-        match self {
-            Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
-            Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
-            _ => false,
-        }
-    }
 }

 impl fmt::Display for LayerFileName {
@@ -274,8 +263,8 @@ impl serde::Serialize for LayerFileName {
        S: serde::Serializer,
    {
        match self {
-            Self::Image(fname) => serializer.collect_str(fname),
-            Self::Delta(fname) => serializer.collect_str(fname),
+            Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
+            Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,12 +7,14 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::{Key, Value};
+use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
+use std::cell::RefCell;
 use std::collections::HashMap;
 use std::sync::OnceLock;
 use tracing::*;
@@ -30,6 +32,12 @@ use tokio::sync::RwLock;

 use super::{DeltaLayer, DeltaLayerWriter, Layer};

+thread_local! {
+    /// A buffer for serializing objects during [`InMemoryLayer::put_value`].
+    /// This buffer is reused for each serialization to avoid additional malloc calls.
+    static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
+}
+
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_id: TenantId,
@@ -230,7 +238,7 @@ impl InMemoryLayer {
    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
-        Ok(inner.file.len())
+        Ok(inner.file.size())
    }

    ///
@@ -265,17 +273,17 @@ impl InMemoryLayer {
    /// Adds the page version to the in-memory tree
    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let inner: &mut _ = &mut *self.inner.write().await;
+        let mut inner = self.inner.write().await;
        self.assert_writable();

        let off = {
-            // Avoid doing allocations for "small" values.
-            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-            buf.clear();
-            val.ser_into(&mut buf)?;
-            inner.file.write_blob(&buf).await?
+            SER_BUFFER.with(|x| -> Result<_> {
+                let mut buf = x.borrow_mut();
+                buf.clear();
+                val.ser_into(&mut (*buf))?;
+                let off = inner.file.write_blob(&buf)?;
+                Ok(off)
+            })?
        };

        let vec_map = inner.index.entry(key).or_default();
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -185,7 +185,7 @@ impl RemoteLayer {
    /// Create a Layer struct representing this layer, after it has been downloaded.
    pub(crate) fn create_downloaded_layer(
        &self,
-        _layer_map_lock_held_witness: &LayerManager,
+        layer_map_lock_held_witness: &LayerManager,
        conf: &'static PageServerConf,
        file_size: u64,
    ) -> Arc<dyn PersistentLayer> {
@@ -197,8 +197,10 @@ impl RemoteLayer {
                self.desc.tenant_id,
                &fname,
                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+                self.access_stats.clone_for_residence_change(
+                    layer_map_lock_held_witness,
+                    LayerResidenceStatus::Resident,
+                ),
            ))
        } else {
            let fname = self.desc.image_file_name();
@@ -208,8 +210,10 @@ impl RemoteLayer {
                self.desc.tenant_id,
                &fname,
                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+                self.access_stats.clone_for_residence_change(
+                    layer_map_lock_held_witness,
+                    LayerResidenceStatus::Resident,
+                ),
            ))
        }
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,6 +1,5 @@
 pub mod delete;
 mod eviction_task;
-mod init;
 pub mod layer_manager;
 mod logical_size;
 pub mod span;
@@ -28,6 +27,7 @@ use utils::id::TenantTimelineId;

 use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
 use std::pin::pin;
@@ -38,13 +38,15 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
-    DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
+    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
+    LayerAccessStats, LayerFileName, RemoteLayer,
 };
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
+    ephemeral_file::is_ephemeral_file,
    layer_map::{LayerMap, SearchResult},
    metadata::{save_metadata, TimelineMetadata},
    par_fsync,
@@ -76,10 +78,11 @@ use utils::{
 use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
-use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::walredo::WalRedoManager;
+use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
+use crate::{is_temporary, task_mgr};

 use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -1208,7 +1211,7 @@ impl Timeline {
                &layer_metadata,
                local_layer
                    .access_stats()
-                    .clone_for_residence_change(LayerResidenceStatus::Evicted),
+                    .clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
            ),
            LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
                self.tenant_id,
@@ -1217,7 +1220,7 @@ impl Timeline {
                &layer_metadata,
                local_layer
                    .access_stats()
-                    .clone_for_residence_change(LayerResidenceStatus::Evicted),
+                    .clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
            ),
        });

@@ -1515,7 +1518,7 @@ impl Timeline {
        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
        let self_clone = Arc::clone(self);

-        debug!("spawning flush loop");
+        info!("spawning flush loop");
        *flush_loop_state = FlushLoopState::Running {
            #[cfg(test)]
            expect_initdb_optimization: false,
@@ -1586,7 +1589,9 @@ impl Timeline {
        ));
    }

+    ///
    /// Initialize with an empty layer map. Used when creating a new timeline.
+    ///
    pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
        let mut layers = self.layers.try_write().expect(
            "in the context where we call this function, no other task has access to the object",
@@ -1594,16 +1599,10 @@ impl Timeline {
        layers.initialize_empty(Lsn(start_lsn.0));
    }

-    /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only
-    /// files.
-    pub(super) async fn load_layer_map(
-        &self,
-        disk_consistent_lsn: Lsn,
-        index_part: Option<IndexPart>,
-    ) -> anyhow::Result<()> {
-        use init::{Decision::*, Discovered, FutureLayer};
-        use LayerFileName::*;
-
+    ///
+    /// Scan the timeline directory to populate the layer map.
+    ///
+    pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;

        let timer = self.metrics.load_layer_map_histo.start_timer();
@@ -1611,153 +1610,102 @@ impl Timeline {
        // Scan timeline directory and create ImageFileName and DeltaFilename
        // structs representing all files on disk
        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
-        let (conf, tenant_id, timeline_id) = (self.conf, self.tenant_id, self.timeline_id);
-        let span = tracing::Span::current();
+        // total size of layer files in the current timeline directory
+        let mut total_physical_size = 0;

-        let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
-            move || {
-                let _g = span.entered();
-                let discovered = init::scan_timeline_dir(&timeline_path)?;
-                let mut discovered_layers = Vec::with_capacity(discovered.len());
-                let mut unrecognized_files = Vec::new();
+        let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();

-                let mut path = timeline_path;
+        for direntry in fs::read_dir(timeline_path)? {
+            let direntry = direntry?;
+            let direntry_path = direntry.path();
+            let fname = direntry.file_name();
+            let fname = fname.to_string_lossy();

-                for discovered in discovered {
-                    let (name, kind) = match discovered {
-                        Discovered::Layer(file_name, file_size) => {
-                            discovered_layers.push((file_name, file_size));
-                            continue;
-                        }
-                        Discovered::Metadata | Discovered::IgnoredBackup => {
-                            continue;
-                        }
-                        Discovered::Unknown(file_name) => {
-                            // we will later error if there are any
-                            unrecognized_files.push(file_name);
-                            continue;
-                        }
-                        Discovered::Ephemeral(name) => (name, "old ephemeral file"),
-                        Discovered::Temporary(name) => (name, "temporary timeline file"),
-                        Discovered::TemporaryDownload(name) => (name, "temporary download"),
-                    };
-                    path.push(name);
-                    init::cleanup(&path, kind)?;
-                    path.pop();
-                }
-
-                if !unrecognized_files.is_empty() {
-                    // assume that if there are any there are many many.
-                    let n = unrecognized_files.len();
-                    let first = &unrecognized_files[..n.min(10)];
-                    anyhow::bail!(
-                        "unrecognized files in timeline dir (total {n}), first 10: {first:?}"
+            if let Some(filename) = ImageFileName::parse_str(&fname) {
+                // create an ImageLayer struct for each image file.
+                if filename.lsn > disk_consistent_lsn {
+                    info!(
+                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
+                        filename, self.timeline_id, disk_consistent_lsn
                    );
+
+                    rename_to_backup(&direntry_path)?;
+                    continue;
                }

-                let decided =
-                    init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
+                let file_size = direntry_path.metadata()?.len();
+                let stats =
+                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);

-                let mut loaded_layers = Vec::new();
-                let mut needs_upload = Vec::new();
-                let mut needs_cleanup = Vec::new();
-                let mut total_physical_size = 0;
+                let layer = ImageLayer::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    &filename,
+                    file_size,
+                    stats,
+                );

-                for (name, decision) in decided {
-                    let decision = match decision {
-                        Ok(UseRemote { local, remote }) => {
-                            path.push(name.file_name());
-                            init::cleanup_local_file_for_remote(&path, &local, &remote)?;
-                            path.pop();
+                total_physical_size += file_size;
+                loaded_layers.push(Arc::new(layer));
+            } else if let Some(filename) = DeltaFileName::parse_str(&fname) {
+                // Create a DeltaLayer struct for each delta file.
+                // The end-LSN is exclusive, while disk_consistent_lsn is
+                // inclusive. For example, if disk_consistent_lsn is 100, it is
+                // OK for a delta layer to have end LSN 101, but if the end LSN
+                // is 102, then it might not have been fully flushed to disk
+                // before crash.
+                if filename.lsn_range.end > disk_consistent_lsn + 1 {
+                    info!(
+                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
+                        filename, self.timeline_id, disk_consistent_lsn
+                    );

-                            UseRemote { local, remote }
-                        }
-                        Ok(decision) => decision,
-                        Err(FutureLayer { local }) => {
-                            if local.is_some() {
-                                path.push(name.file_name());
-                                init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
-                                path.pop();
-                            }
-                            needs_cleanup.push(name);
-                            continue;
-                        }
-                    };
-
-                    match &name {
-                        Delta(d) => assert!(d.lsn_range.end <= disk_consistent_lsn + 1),
-                        Image(i) => assert!(i.lsn <= disk_consistent_lsn),
-                    }
-
-                    let status = match &decision {
-                        UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
-                        Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
-                    };
-
-                    let stats = LayerAccessStats::for_loading_layer(status);
-
-                    let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
-                        (Delta(d), UseLocal(m) | NeedsUpload(m)) => {
-                            total_physical_size += m.file_size();
-                            Arc::new(DeltaLayer::new(
-                                conf,
-                                timeline_id,
-                                tenant_id,
-                                &d,
-                                m.file_size(),
-                                stats,
-                            ))
-                        }
-                        (Image(i), UseLocal(m) | NeedsUpload(m)) => {
-                            total_physical_size += m.file_size();
-                            Arc::new(ImageLayer::new(
-                                conf,
-                                timeline_id,
-                                tenant_id,
-                                &i,
-                                m.file_size(),
-                                stats,
-                            ))
-                        }
-                        (Delta(d), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
-                            RemoteLayer::new_delta(tenant_id, timeline_id, &d, remote, stats),
-                        ),
-                        (Image(i), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
-                            RemoteLayer::new_img(tenant_id, timeline_id, &i, remote, stats),
-                        ),
-                    };
-
-                    if let NeedsUpload(m) = decision {
-                        needs_upload.push((layer.clone(), m));
-                    }
-
-                    loaded_layers.push(layer);
+                    rename_to_backup(&direntry_path)?;
+                    continue;
                }
-                Ok((
-                    loaded_layers,
-                    (needs_upload, needs_cleanup),
-                    total_physical_size,
-                ))
+
+                let file_size = direntry_path.metadata()?.len();
+                let stats =
+                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
+
+                let layer = DeltaLayer::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    &filename,
+                    file_size,
+                    stats,
+                );
+
+                total_physical_size += file_size;
+                loaded_layers.push(Arc::new(layer));
+            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
+                // ignore these
+            } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
+                info!(
+                    "skipping temp download file, reconcile_with_remote will resume / clean up: {}",
+                    fname
+                );
+            } else if is_ephemeral_file(&fname) {
+                // Delete any old ephemeral files
+                trace!("deleting old ephemeral file in timeline dir: {}", fname);
+                fs::remove_file(&direntry_path)?;
+            } else if is_temporary(&direntry_path) {
+                info!("removing temp timeline file at {}", direntry_path.display());
+                fs::remove_file(&direntry_path).with_context(|| {
+                    format!(
+                        "failed to remove temp download file at {}",
+                        direntry_path.display()
+                    )
+                })?;
+            } else {
+                warn!("unrecognized filename in timeline dir: {}", fname);
            }
-        })
-        .await
-        .map_err(anyhow::Error::new)
-        .and_then(|x| x)?;
+        }

        let num_layers = loaded_layers.len();
-
-        guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
-
-        if let Some(rtc) = self.remote_client.as_ref() {
-            let (needs_upload, needs_cleanup) = to_sync;
-            for (layer, m) in needs_upload {
-                rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
-            }
-            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
-            rtc.schedule_index_upload_for_file_changes()?;
-            // Tenant::create_timeline will wait for these uploads to happen before returning, or
-            // on retry.
-        }
+        guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);

        info!(
            "loaded layer map with {} layers at {}, total physical size: {}",
@@ -1768,6 +1716,236 @@ impl Timeline {
            .set(total_physical_size);

        timer.stop_and_record();
+
+        Ok(())
+    }
+
+    async fn create_remote_layers(
+        &self,
+        index_part: &IndexPart,
+        local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
+        up_to_date_disk_consistent_lsn: Lsn,
+    ) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
+        // Are we missing some files that are present in remote storage?
+        // Create RemoteLayer instances for them.
+        let mut local_only_layers = local_layers;
+
+        // We're holding a layer map lock for a while but this
+        // method is only called during init so it's fine.
+        let mut guard = self.layers.write().await;
+
+        let mut corrupted_local_layers = Vec::new();
+        let mut added_remote_layers = Vec::new();
+        for remote_layer_name in index_part.layer_metadata.keys() {
+            let local_layer = local_only_layers.remove(remote_layer_name);
+
+            let remote_layer_metadata = index_part
+                .layer_metadata
+                .get(remote_layer_name)
+                .map(LayerFileMetadata::from)
+                .with_context(|| {
+                    format!(
+                        "No remote layer metadata found for layer {}",
+                        remote_layer_name.file_name()
+                    )
+                })?;
+
+            // Is the local layer's size different from the size stored in the
+            // remote index file?
+            // If so, rename_to_backup those files & replace their local layer with
+            // a RemoteLayer in the layer map so that we re-download them on-demand.
+            if let Some(local_layer) = local_layer {
+                let local_layer_path = local_layer
+                    .local_path()
+                    .expect("caller must ensure that local_layers only contains local layers");
+                ensure!(
+                    local_layer_path.exists(),
+                    "every layer from local_layers must exist on disk: {}",
+                    local_layer_path.display()
+                );
+
+                let remote_size = remote_layer_metadata.file_size();
+                let metadata = local_layer_path.metadata().with_context(|| {
+                    format!(
+                        "get file size of local layer {}",
+                        local_layer_path.display()
+                    )
+                })?;
+                let local_size = metadata.len();
+                if local_size != remote_size {
+                    warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+                    if let Err(err) = rename_to_backup(&local_layer_path) {
+                        assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
+                        anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
+                    } else {
+                        self.metrics.resident_physical_size_gauge.sub(local_size);
+                        corrupted_local_layers.push(local_layer);
+                        // fall-through to adding the remote layer
+                    }
+                } else {
+                    debug!(
+                        "layer is present locally and file size matches remote, using it: {}",
+                        local_layer_path.display()
+                    );
+                    continue;
+                }
+            }
+
+            info!(
+                "remote layer does not exist locally, creating remote layer: {}",
+                remote_layer_name.file_name()
+            );
+
+            match remote_layer_name {
+                LayerFileName::Image(imgfilename) => {
+                    if imgfilename.lsn > up_to_date_disk_consistent_lsn {
+                        info!(
+                        "found future image layer {} on timeline {} remote_consistent_lsn is {}",
+                        imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
+                    );
+                        continue;
+                    }
+                    let stats =
+                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
+
+                    let remote_layer = RemoteLayer::new_img(
+                        self.tenant_id,
+                        self.timeline_id,
+                        imgfilename,
+                        &remote_layer_metadata,
+                        stats,
+                    );
+                    let remote_layer = Arc::new(remote_layer);
+                    added_remote_layers.push(remote_layer);
+                }
+                LayerFileName::Delta(deltafilename) => {
+                    // Create a RemoteLayer for the delta file.
+                    // The end-LSN is exclusive, while disk_consistent_lsn is
+                    // inclusive. For example, if disk_consistent_lsn is 100, it is
+                    // OK for a delta layer to have end LSN 101, but if the end LSN
+                    // is 102, then it might not have been fully flushed to disk
+                    // before crash.
+                    if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
+                        info!(
+                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
+                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
+                        );
+                        continue;
+                    }
+                    let stats =
+                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
+
+                    let remote_layer = RemoteLayer::new_delta(
+                        self.tenant_id,
+                        self.timeline_id,
+                        deltafilename,
+                        &remote_layer_metadata,
+                        stats,
+                    );
+                    let remote_layer = Arc::new(remote_layer);
+                    added_remote_layers.push(remote_layer);
+                }
+            }
+        }
+        guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
+        Ok(local_only_layers)
+    }
+
+    /// This function will synchronize local state with what we have in remote storage.
+    ///
+    /// Steps taken:
+    /// 1. Initialize upload queue based on `index_part`.
+    /// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
+    ///    The list of layers on the remote comes from `index_part`.
+    ///    The list of local layers is given by the layer map's `iter_historic_layers()`.
+    ///    So, the layer map must have been loaded already.
+    /// 3. Schedule upload of local-only layer files (which will then also update the remote
+    ///    IndexPart to include the new layer files).
+    ///
+    /// Refer to the [`remote_timeline_client`] module comment for more context.
+    ///
+    /// # TODO
+    /// May be a bit cleaner to do things based on populated remote client,
+    /// and then do things based on its upload_queue.latest_files.
+    #[instrument(skip(self, index_part, up_to_date_metadata))]
+    pub async fn reconcile_with_remote(
+        &self,
+        up_to_date_metadata: &TimelineMetadata,
+        index_part: Option<&IndexPart>,
+    ) -> anyhow::Result<()> {
+        info!("starting");
+        let remote_client = self
+            .remote_client
+            .as_ref()
+            .ok_or_else(|| anyhow!("cannot download without remote storage"))?;
+
+        let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
+
+        let local_layers = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            layers
+                .iter_historic_layers()
+                .map(|l| (l.filename(), guard.get_from_desc(&l)))
+                .collect::<HashMap<_, _>>()
+        };
+
+        // If no writes happen, new branches do not have any layers, only the metadata file.
+        let has_local_layers = !local_layers.is_empty();
+        let local_only_layers = match index_part {
+            Some(index_part) => {
+                info!(
+                    "initializing upload queue from remote index with {} layer files",
+                    index_part.layer_metadata.len()
+                );
+                remote_client.init_upload_queue(index_part)?;
+                self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
+                    .await?
+            }
+            None => {
+                info!("initializing upload queue as empty");
+                remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
+                local_layers
+            }
+        };
+
+        if has_local_layers {
+            // Are there local files that don't exist remotely? Schedule uploads for them.
+            // Local timeline metadata will get uploaded to remote along with the layers.
+            for (layer_name, layer) in &local_only_layers {
+                // XXX solve this in the type system
+                let layer_path = layer
+                    .local_path()
+                    .expect("local_only_layers only contains local layers");
+                let layer_size = layer_path
+                    .metadata()
+                    .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
+                    .len();
+                info!("scheduling {layer_path:?} for upload");
+                remote_client
+                    .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
+            }
+            remote_client.schedule_index_upload_for_file_changes()?;
+        } else if index_part.is_none() {
+            // No data on the remote storage, no local layers, local metadata file.
+            //
+            // TODO https://github.com/neondatabase/neon/issues/3865
+            // Currently, console does not wait for the timeline data upload to the remote storage
+            // and considers the timeline created, expecting other pageserver nodes to work with it.
+            // Branch metadata upload could get interrupted (e.g pageserver got killed),
+            // hence any locally existing branch metadata with no remote counterpart should be uploaded,
+            // otherwise any other pageserver won't see the branch on `attach`.
+            //
+            // After the issue gets implemented, pageserver should rather remove the branch,
+            // since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
+            // no need to keep the old files.
+            remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
+        } else {
+            // Local timeline has a metadata file, remote one too, both have no layers to sync.
+        }
+
+        info!("Done");
+
        Ok(())
    }

@@ -2674,6 +2852,7 @@ impl Timeline {
            if let Some(ref l) = delta_layer_to_add {
                // TODO: move access stats, metrics update, etc. into layer manager.
                l.access_stats().record_residence_event(
+                    &guard,
                    LayerResidenceStatus::Resident,
                    LayerResidenceEventReason::LayerCreate,
                );
@@ -3062,6 +3241,7 @@ impl Timeline {
                .add(metadata.len());
            let l = Arc::new(l);
            l.access_stats().record_residence_event(
+                &guard,
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
@@ -3741,6 +3921,7 @@ impl Timeline {

            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
            l.access_stats().record_residence_event(
+                &guard,
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
@@ -4659,8 +4840,7 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
    for i in 0u32.. {
        new_path.set_file_name(format!("{filename}.{i}.old"));
        if !new_path.exists() {
-            std::fs::rename(path, &new_path)
-                .with_context(|| format!("rename {path:?} to {new_path:?}"))?;
+            std::fs::rename(path, &new_path)?;
            return Ok(());
        }
    }
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -1,199 +0,0 @@
-use crate::{
-    is_temporary,
-    tenant::{
-        ephemeral_file::is_ephemeral_file,
-        remote_timeline_client::{
-            self,
-            index::{IndexPart, LayerFileMetadata},
-        },
-        storage_layer::LayerFileName,
-    },
-    METADATA_FILE_NAME,
-};
-use anyhow::Context;
-use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
-use utils::lsn::Lsn;
-
-/// Identified files in the timeline directory.
-pub(super) enum Discovered {
-    /// The only one we care about
-    Layer(LayerFileName, u64),
-    /// Old ephmeral files from previous launches, should be removed
-    Ephemeral(OsString),
-    /// Old temporary timeline files, unsure what these really are, should be removed
-    Temporary(OsString),
-    /// Temporary on-demand download files, should be removed
-    TemporaryDownload(OsString),
-    /// "metadata" file we persist locally and include in `index_part.json`
-    Metadata,
-    /// Backup file from previously future layers
-    IgnoredBackup,
-    /// Unrecognized, warn about these
-    Unknown(OsString),
-}
-
-/// Scans the timeline directory for interesting files.
-pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
-    let mut ret = Vec::new();
-
-    for direntry in std::fs::read_dir(path)? {
-        let direntry = direntry?;
-        let direntry_path = direntry.path();
-        let file_name = direntry.file_name();
-
-        let fname = file_name.to_string_lossy();
-
-        let discovered = match LayerFileName::from_str(&fname) {
-            Ok(file_name) => {
-                let file_size = direntry.metadata()?.len();
-                Discovered::Layer(file_name, file_size)
-            }
-            Err(_) => {
-                if fname == METADATA_FILE_NAME {
-                    Discovered::Metadata
-                } else if fname.ends_with(".old") {
-                    // ignore these
-                    Discovered::IgnoredBackup
-                } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
-                    Discovered::TemporaryDownload(file_name)
-                } else if is_ephemeral_file(&fname) {
-                    Discovered::Ephemeral(file_name)
-                } else if is_temporary(&direntry_path) {
-                    Discovered::Temporary(file_name)
-                } else {
-                    Discovered::Unknown(file_name)
-                }
-            }
-        };
-
-        ret.push(discovered);
-    }
-
-    Ok(ret)
-}
-
-/// Decision on what to do with a layer file after considering its local and remote metadata.
-#[derive(Clone)]
-pub(super) enum Decision {
-    /// The layer is not present locally.
-    Evicted(LayerFileMetadata),
-    /// The layer is present locally, but local metadata does not match remote; we must
-    /// delete it and treat it as evicted.
-    UseRemote {
-        local: LayerFileMetadata,
-        remote: LayerFileMetadata,
-    },
-    /// The layer is present locally, and metadata matches.
-    UseLocal(LayerFileMetadata),
-    /// The layer is only known locally, it needs to be uploaded.
-    NeedsUpload(LayerFileMetadata),
-}
-
-/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
-#[derive(Debug)]
-pub(super) struct FutureLayer {
-    /// The local metadata. `None` if the layer is only known through [`IndexPart`].
-    pub(super) local: Option<LayerFileMetadata>,
-}
-
-/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
-///
-/// This function should not gain additional reasons to fail than [`FutureLayer`], consider adding
-/// the checks earlier to [`scan_timeline_dir`].
-pub(super) fn reconcile(
-    discovered: Vec<(LayerFileName, u64)>,
-    index_part: Option<&IndexPart>,
-    disk_consistent_lsn: Lsn,
-) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
-    use Decision::*;
-
-    // name => (local, remote)
-    type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
-
-    let mut discovered = discovered
-        .into_iter()
-        .map(|(name, file_size)| (name, (Some(LayerFileMetadata::new(file_size)), None)))
-        .collect::<Collected>();
-
-    // merge any index_part information, when available
-    index_part
-        .as_ref()
-        .map(|ip| ip.layer_metadata.iter())
-        .into_iter()
-        .flatten()
-        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
-        .for_each(|(name, metadata)| {
-            if let Some(existing) = discovered.get_mut(name) {
-                existing.1 = Some(metadata);
-            } else {
-                discovered.insert(name.to_owned(), (None, Some(metadata)));
-            }
-        });
-
-    discovered
-        .into_iter()
-        .map(|(name, (local, remote))| {
-            let decision = if name.is_in_future(disk_consistent_lsn) {
-                Err(FutureLayer { local })
-            } else {
-                Ok(match (local, remote) {
-                    (Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
-                    (Some(x), Some(_)) => UseLocal(x),
-                    (None, Some(x)) => Evicted(x),
-                    (Some(x), None) => NeedsUpload(x),
-                    (None, None) => {
-                        unreachable!("there must not be any non-local non-remote files")
-                    }
-                })
-            };
-
-            (name, decision)
-        })
-        .collect::<Vec<_>>()
-}
-
-pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
-    let file_name = path.file_name().expect("must be file path");
-    tracing::debug!(kind, ?file_name, "cleaning up");
-    std::fs::remove_file(path)
-        .with_context(|| format!("failed to remove {kind} at {}", path.display()))
-}
-
-pub(super) fn cleanup_local_file_for_remote(
-    path: &Path,
-    local: &LayerFileMetadata,
-    remote: &LayerFileMetadata,
-) -> anyhow::Result<()> {
-    let local_size = local.file_size();
-    let remote_size = remote.file_size();
-
-    let file_name = path.file_name().expect("must be file path");
-    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
-        assert!(
-            path.exists(),
-            "we would leave the local_layer without a file if this does not hold: {}",
-            path.display()
-        );
-        Err(err)
-    } else {
-        Ok(())
-    }
-}
-
-pub(super) fn cleanup_future_layer(
-    path: &Path,
-    name: &LayerFileName,
-    disk_consistent_lsn: Lsn,
-) -> anyhow::Result<()> {
-    use LayerFileName::*;
-    let kind = match name {
-        Delta(_) => "delta",
-        Image(_) => "image",
-    };
-    // future image layers are allowed to be produced always for not yet flushed to disk
-    // lsns stored in InMemoryLayer.
-    tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
-    crate::tenant::timeline::rename_to_backup(path)?;
-    Ok(())
-}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -13,7 +13,7 @@ use crate::{
        layer_map::{BatchedUpdates, LayerMap},
        storage_layer::{
            AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
-            PersistentLayerDesc, PersistentLayerKey,
+            PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
        },
        timeline::compare_arced_layers,
    },
@@ -85,6 +85,21 @@ impl LayerManager {
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

+    pub(crate) fn initialize_remote_layers(
+        &mut self,
+        corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
+        remote_layers: Vec<Arc<RemoteLayer>>,
+    ) {
+        let mut updates = self.layer_map.batch_update();
+        for layer in corrupted_local_layers {
+            Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
+        }
+        for layer in remote_layers {
+            Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
+        }
+        updates.flush();
+    }
+
    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
    /// called within `get_layer_for_write`.
    pub(crate) fn get_layer_for_write(
@@ -250,6 +265,16 @@ impl LayerManager {
        mapping.insert(layer);
    }

+    /// Helper function to remove a layer from the layer map and file manager
+    fn remove_historic_layer(
+        layer: Arc<dyn PersistentLayer>,
+        updates: &mut BatchedUpdates<'_>,
+        mapping: &mut LayerFileManager,
+    ) {
+        updates.remove_historic(layer.layer_desc());
+        mapping.remove(layer);
+    }
+
    /// Removes the layer from local FS (if present) and from memory.
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
-use crate::task_mgr::{shutdown_token, TaskKind};
+use crate::task_mgr::TaskKind;
 use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -211,14 +211,11 @@ async fn subscribe_for_timeline_updates(
    id: TenantTimelineId,
 ) -> Streaming<SafekeeperTimelineInfo> {
    let mut attempt = 0;
-    let cancel = shutdown_token();
-
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            &cancel,
        )
        .await;
        attempt += 1;
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -148,16 +148,17 @@ impl UploadQueue {
            );
        }

+        let index_part_metadata = index_part.parse_metadata()?;
        info!(
            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part.metadata.disk_consistent_lsn()
+            index_part_metadata.disk_consistent_lsn()
        );

        let state = UploadQueueInitialized {
            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: index_part.metadata.clone(),
-            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
+            latest_metadata: index_part_metadata.clone(),
+            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -408,9 +408,9 @@ async fn connect_to_compute_once(
    let (tx, mut rx) = tokio::sync::watch::channel(session);

    let conn_id = uuid::Uuid::new_v4();
-    let span = info_span!(parent: None, "connection", %conn_id);
+    let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
    span.in_scope(|| {
-        info!(%conn_info, %session, "new connection");
+        info!(%session, "new connection");
    });

    tokio::spawn(
@@ -420,28 +420,26 @@ async fn connect_to_compute_once(
                info!(%session, "changed session");
            }

-            loop {
-                let message = ready!(connection.poll_message(cx));
+            let message = ready!(connection.poll_message(cx));

-                match message {
-                    Some(Ok(AsyncMessage::Notice(notice))) => {
-                        info!(%session, "notice: {}", notice);
-                    }
-                    Some(Ok(AsyncMessage::Notification(notif))) => {
-                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                    }
-                    Some(Ok(_)) => {
-                        warn!(%session, "unknown message");
-                    }
-                    Some(Err(e)) => {
-                        error!(%session, "connection error: {}", e);
-                        return Poll::Ready(())
-                    }
-                    None => {
-                        info!("connection closed");
-                        return Poll::Ready(())
-                    }
+            match message {
+                Some(Ok(AsyncMessage::Notice(notice))) => {
+                    info!(%session, "notice: {}", notice);
+                    Poll::Pending
                }
+                Some(Ok(AsyncMessage::Notification(notif))) => {
+                    warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    Poll::Pending
+                }
+                Some(Ok(_)) => {
+                    warn!(%session, "unknown message");
+                    Poll::Pending
+                }
+                Some(Err(e)) => {
+                    error!(%session, "connection error: {}", e);
+                    Poll::Ready(())
+                }
+                None => Poll::Ready(()),
            }
        })
        .instrument(span)
--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -0,0 +1,76 @@
+#! /usr/bin/env python3
+# Script to generate ext_index.json metadata file
+# that stores content of the control files and location of extension archives
+# for all extensions in extensions subdir.
+import argparse
+import json
+import subprocess
+from pathlib import Path
+
+"""
+# ext_index.json example:
+{
+    "public_extensions": [
+        "anon"
+    ],
+    "library_index": {
+        "anon": "anon",
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
+    },
+    "extension_data": {
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5648391853/v15/extensions/anon.tar.zst"
+        }
+    }
+}
+"""
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="generate ext_index.json")
+    parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
+    parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
+    parser.add_argument("--public_extensions", type=str, help="list of public extensions")
+    args = parser.parse_args()
+    pg_version = args.pg_version
+    BUILD_TAG = args.BUILD_TAG
+    public_ext_list = args.public_extensions.split(",")
+
+    ext_index = {}
+    library_index = {}
+    EXT_PATH = Path("extensions")
+    for extension in EXT_PATH.iterdir():
+        if extension.is_dir():
+            control_data = {}
+            for control_file in extension.glob("*.control"):
+                if control_file.suffix != ".control":
+                    continue
+                with open(control_file, "r") as f:
+                    control_data[control_file.name] = f.read()
+            ext_index[extension.name] = {
+                "control_data": control_data,
+                "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
+            }
+        elif extension.suffix == ".zst":
+            file_list = (
+                str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
+                .strip()
+                .split("\n")
+            )
+            for file in file_list:
+                if file.endswith(".so") and file.startswith("lib/"):
+                    lib_name = file[4:-3]
+                    library_index[lib_name] = extension.name.replace(".tar.zst", "")
+
+    all_data = {
+        "public_extensions": public_ext_list,
+        "library_index": library_index,
+        "extension_data": ext_index,
+    }
+    with open("ext_index.json", "w") as f:
+        json.dump(all_data, f)
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -12,26 +12,25 @@ import psycopg2.extras
 # We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
 FLAKY_TESTS_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
+        DISTINCT parent_suite, suite, test
    FROM
        (
            SELECT
-                reference,
-                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
-                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
-                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
+                revision,
+                jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
            FROM
                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
        ) data
    WHERE
        timestamp > CURRENT_DATE - INTERVAL '%s' day
-        AND (
-            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
-            OR retries_status_change::boolean
-        )
+        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
    ;
 """

@@ -41,9 +40,6 @@ def main(args: argparse.Namespace):
    interval_days = args.days
    output = args.output

-    build_type = args.build_type
-    pg_version = args.pg_version
-
    res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
    res = defaultdict(lambda: defaultdict(dict))

@@ -59,21 +55,8 @@ def main(args: argparse.Namespace):
        rows = []

    for row in rows:
-        # We don't want to automatically rerun tests in a performance suite
-        if row["parent_suite"] != "test_runner.regress":
-            continue
-
-        deparametrized_test = row["deparametrized_test"]
-        dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
-        parametrized_test = deparametrized_test.replace(
-            "[",
-            f"[{build_type}-pg{pg_version}{dash_if_needed}",
-        )
-        res[row["parent_suite"]][row["suite"]][parametrized_test] = True
-
-        logging.info(
-            f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}"
-        )
+        logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
+        res[row["parent_suite"]][row["suite"]][row["test"]] = True

    logging.info(f"saving results to {output.name}")
    json.dump(res, output, indent=2)
@@ -94,18 +77,6 @@ if __name__ == "__main__":
        type=int,
        help="how many days to look back for flaky tests (default: 10)",
    )
-    parser.add_argument(
-        "--build-type",
-        required=True,
-        type=str,
-        help="for which build type to create list of flaky tests (debug or release)",
-    )
-    parser.add_argument(
-        "--pg-version",
-        required=True,
-        type=int,
-        help="for which Postgres version to create list of flaky tests (14, 15, etc.)",
-    )
    parser.add_argument(
        "connstr",
        help="connection string to the test results database",
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -233,19 +233,10 @@ if TYPE_CHECKING:

 def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
    response = list_prefix(neon_env_builder, prefix)
-    keys = response["KeyCount"]
-    objects = response.get("Contents", [])
-
-    if keys != 0 and len(objects) == 0:
-        # this has been seen in one case with mock_s3:
-        # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
-        # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-        common_prefixes = response.get("CommonPrefixes", [])
-        log.warn(
-            f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
-        )
-
-    assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"


 def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -369,7 +369,7 @@ def test_download_remote_layers_api(
    filled_current_physical = get_api_current_physical_size()
    log.info(filled_current_physical)
    filled_size = get_resident_physical_size()
-    log.info(f"filled_size: {filled_size}")
+    log.info(filled_size)
    assert filled_current_physical == filled_size, "we don't yet do layer eviction"

    env.pageserver.stop()
@@ -377,7 +377,7 @@ def test_download_remote_layers_api(
    # remove all the layer files
    # XXX only delete some of the layer files, to show that it really just downloads all the layers
    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
-        log.info(f"unlinking layer {layer.name}")
+        log.info(f"unlinking layer {layer}")
        layer.unlink()

    # Shut down safekeepers before starting the pageserver.
@@ -403,7 +403,7 @@ def test_download_remote_layers_api(
        filled_current_physical == get_api_current_physical_size()
    ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
    post_unlink_size = get_resident_physical_size()
-    log.info(f"post_unlink_size: {post_unlink_size}")
+    log.info(post_unlink_size)
    assert (
        post_unlink_size < filled_size
    ), "we just deleted layers and didn't cause anything to re-download them yet"
--- a/vm-cgconfig.conf
+++ b/vm-cgconfig.conf
@@ -0,0 +1,12 @@
+# Configuration for cgroups in VM compute nodes
+group neon-postgres {
+    perm {
+        admin {
+            uid = vm-informant;
+        }
+        task {
+            gid = users;
+        }
+    }
+    memory {}
+}
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -14,13 +14,11 @@ publish = false
 ### BEGIN HAKARI SECTION
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-axum = { version = "0.6", features = ["ws"] }
 bytes = { version = "1", features = ["serde"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
-digest = { version = "0.10", features = ["mac", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures = { version = "0.3" }
@@ -29,7 +27,6 @@ futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
-hyper = { version = "0.14", features = ["full"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -48,7 +45,6 @@ rustls = { version = "0.20", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
-smallvec = { version = "1", default-features = false, features = ["write"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.23" }
@@ -58,6 +54,7 @@ toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
 url = { version = "2", features = ["serde"] }

 [build-dependencies]
Author	SHA1	Message	Date
Alek Westover	595baa386e	alphabetize	2023-08-23 13:24:42 -04:00
Alek Westover	bb8ca7c7fd	allow v16	2023-08-23 13:23:51 -04:00