Merge pull request #5161 from neondatabase/rc-2023-08-31

Release 2023-08-31
pageserver: fix flake in test_timeline_deletion_with_files_stuck_in_upload_queue (#5149)
2026-07-16 02:20:38 +00:00 · 2023-08-31 16:53:17 +03:00 · 2023-08-31 10:42:32 +01:00 · 2023-08-31 10:40:46 +01:00 · 2023-08-31 12:23:51 +03:00 · 2023-08-31 09:19:34 +01:00
86 changed files with 5659 additions and 1659 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -145,7 +145,11 @@ runs:

        if [ "${RERUN_FLAKY}" == "true" ]; then
          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
+                                              --days 7 \
+                                              --output "$TEST_OUTPUT/flaky.json" \
+                                              --pg-version "${DEFAULT_PG_VERSION}" \
+                                              --build-type "${BUILD_TYPE}"

          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -737,34 +737,6 @@ jobs:
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --cleanup

-      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
-      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
-      # so we won't build extension twice, but extract them from compute-node.
-      #
-      # For now we use extensions image only for new custom extensitons
-      - name: Kaniko build extensions only
-        run: |
-          # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
-          # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
-          # it still fails with error:
-          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
-          #
-          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
-          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
-
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-                           --context . \
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
-                           --build-arg PG_VERSION=${{ matrix.version }} \
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
-                           --dockerfile Dockerfile.compute-node \
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --cleanup \
-                           --target postgres-extensions
-
      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
@@ -780,7 +752,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.16.3
+      VM_BUILDER_VERSION: v0.17.5

    steps:
      - name: Checkout
@@ -803,7 +775,7 @@ jobs:
        run: |
          ./vm-builder \
            -enable-file-cache \
-            -enable-monitor \
+            -cgroup-uid=postgres \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -886,10 +858,8 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -900,10 +870,8 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -925,10 +893,8 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
@@ -938,7 +904,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ promote-images, tag ]
+    needs: [ tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -973,57 +939,10 @@ jobs:
              }
            }"

-  upload-postgres-extensions-to-s3:
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-       github.event_name != 'workflow_dispatch'
-    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
-    needs: [ tag, promote-images ]
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15 ]
-
-    env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ github.ref_name == 'release' && 'latest' || needs.tag.outputs.build-tag }}
-      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
-
-    steps:
-      - name: Pull postgres-extensions image
-        run: |
-          docker pull ${EXTENSIONS_IMAGE}
-
-      - name: Create postgres-extensions container
-        id: create-container
-        run: |
-          EID=$(docker create ${EXTENSIONS_IMAGE} true)
-          echo "EID=${EID}" >> $GITHUB_OUTPUT
-
-      - name: Extract postgres-extensions from container
-        run: |
-          rm -rf ./extensions-to-upload # Just in case
-          mkdir -p extensions-to-upload
-
-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
-          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
-
-      - name: Upload postgres-extensions to S3
-        run: |
-          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
-            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
-          done
-
-      - name: Cleanup
-        if: ${{ always() && steps.create-container.outputs.EID }}
-        run: |
-          docker rm ${{ steps.create-container.outputs.EID }} || true
-
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
--- a/13
+++ b/13
@@ -1,11 +1,12 @@
-/compute_tools/ @neondatabase/control-plane
+/compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/ @neondatabase/compute @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute 
-/libs/remote_storage/ @neondatabase/storage 
-/libs/safekeeper_api/ @neondatabase/safekeepers  
-/pageserver/ @neondatabase/compute @neondatabase/storage 
+/libs/postgres_ffi/ @neondatabase/compute
+/libs/remote_storage/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/safekeepers
+/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
+/pageserver/ @neondatabase/compute @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/proxy/ @neondatabase/control-plane 
+/proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
    "libs/remote_storage",
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
+    "libs/vm_monitor",
 ]

 [workspace.package]
@@ -36,17 +37,19 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.55", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.27"
-aws-smithy-http = "0.55"
-aws-credential-types = "0.55"
-aws-types = "0.55"
+aws-config = { version = "0.56", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "0.29"
+aws-smithy-http = "0.56"
+aws-credential-types = "0.56"
+aws-types = "0.56"
+axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
+cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
 close_fds = "0.3.2"
@@ -74,6 +77,7 @@ humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.9"
+inotify = "0.10.2"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -101,16 +105,18 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
-rustls = "0.20"
+rustls = "0.21"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
-sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sysinfo = "0.29.2"
+sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
+smallvec = "1.11"
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -119,11 +125,11 @@ sync_wrapper = "0.1.2"
 tar = "0.4"
 test-context = "0.1"
 thiserror = "1.0"
-tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
+tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-io-timeout = "1.2.0"
-tokio-postgres-rustls = "0.9.0"
-tokio-rustls = "0.23"
+tokio-postgres-rustls = "0.10.0"
+tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
@@ -133,11 +139,11 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
-webpki-roots = "0.23"
+webpki-roots = "0.25"
 x509-parser = "0.15"

 ## TODO replace this with tracing
@@ -169,14 +175,15 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
+vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
 criterion = "0.5.1"
-rcgen = "0.10"
-rstest = "0.17"
+rcgen = "0.11"
+rstest = "0.18"
 tempfile = "3.4"
 tonic-build = "0.9"

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -211,8 +211,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
-    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
+    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -764,29 +764,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
-#
-# Extenstion only
-#
-#########################################################################################
-FROM python:3.9-slim-bullseye AS generate-ext-index
-ARG PG_VERSION
-ARG BUILD_TAG
-RUN apt update && apt install -y zstd
-
-# copy the control files here
-COPY --from=kq-imcx-pg-build /extensions/ /extensions/
-COPY --from=pg-anon-pg-build /extensions/ /extensions/
-COPY --from=postgis-build /extensions/ /extensions/
-COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
-
-FROM scratch AS postgres-extensions
-# After the transition this layer will include all extensitons.
-# As for now, it's only a couple for testing purposses
-COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
-COPY --from=generate-ext-index /ext_index.json /ext_index.json
-
 #########################################################################################
 #
 # Final layer
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 chrono.workspace = true
+cfg-if.workspace = true
 clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
@@ -23,6 +24,7 @@ tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
@@ -34,4 +36,5 @@ utils.workspace = true
 workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -19,9 +19,10 @@ Also `compute_ctl` spawns two separate service threads:
 - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
  last activity requests.

-If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
-compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
-downscaling and (eventually) will request immediate upscaling under resource pressure.
+If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the
+`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
+`vm-monitor` communicates with the VM autoscaling system. It coordinates
+downscaling and requests immediate upscaling under resource pressure.

 Usage example:
 ```sh
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -20,9 +20,10 @@
 //! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
 //!   last activity requests.
 //!
-//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
-//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
-//! downscaling and (eventually) will request immediate upscaling under resource pressure.
+//! If the `AUTOSCALING` environment variable is set, `compute_ctl` will start the
+//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
+//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
+//! downscaling and requests immediate upscaling under resource pressure.
 //!
 //! Usage example:
 //! ```sh
@@ -35,7 +36,6 @@
 //!
 use std::collections::HashMap;
 use std::fs::File;
-use std::panic;
 use std::path::Path;
 use std::process::exit;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
@@ -271,6 +271,57 @@ fn main() -> Result<()> {
        }
    };

+    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
+    // because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            use std::env;
+            use tokio_util::sync::CancellationToken;
+            use tracing::warn;
+            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
+            let cgroup = matches.get_one::<String>("cgroup");
+            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
+
+            // Only make a runtime if we need to.
+            // Note: it seems like you can make a runtime in an inner scope and
+            // if you start a task in it it won't be dropped. However, make it
+            // in the outermost scope just to be safe.
+            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
+                (None, None) => None,
+                (None, Some(_)) => {
+                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
+                    None
+                }
+                (Some(_), None) => {
+                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
+                }
+                (Some(_), Some(_)) => Some(
+                    tokio::runtime::Builder::new_multi_thread()
+                        .worker_threads(4)
+                        .enable_all()
+                        .build()
+                        .expect("failed to create tokio runtime for monitor"),
+                ),
+            };
+
+            // This token is used internally by the monitor to clean up all threads
+            let token = CancellationToken::new();
+
+            let vm_monitor = &rt.as_ref().map(|rt| {
+                rt.spawn(vm_monitor::start(
+                    Box::leak(Box::new(vm_monitor::Args {
+                        cgroup: cgroup.cloned(),
+                        pgconnstr: file_cache_connstr.cloned(),
+                        addr: vm_monitor_addr.cloned().unwrap(),
+                        file_cache_on_disk,
+                    })),
+                    token.clone(),
+                ))
+            });
+        }
+    }
+
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    if let Some(mut pg) = pg {
@@ -284,6 +335,24 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }

+    // Terminate the vm_monitor so it releases the file watcher on
+    // /sys/fs/cgroup/neon-postgres.
+    // Note: the vm-monitor only runs on linux because it requires cgroups.
+    cfg_if::cfg_if! {
+        if #[cfg(target_os = "linux")] {
+            if let Some(handle) = vm_monitor {
+                // Kills all threads spawned by the monitor
+                token.cancel();
+                // Kills the actual task running the monitor
+                handle.abort();
+
+                // If handle is some, rt must have been used to produce it, and
+                // hence is also some
+                rt.unwrap().shutdown_timeout(Duration::from_secs(2));
+            }
+        }
+    }
+
    // Maybe sync safekeepers again, to speed up next startup
    let compute_state = compute.state.lock().unwrap().clone();
    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -393,6 +462,34 @@ fn cli() -> clap::Command {
                .long("remote-ext-config")
                .value_name("REMOTE_EXT_CONFIG"),
        )
+        // TODO(fprasx): we currently have default arguments because the cloud PR
+        // to pass them in hasn't been merged yet. We should get rid of them once
+        // the PR is merged.
+        .arg(
+            Arg::new("vm-monitor-addr")
+                .long("vm-monitor-addr")
+                .default_value("0.0.0.0:10301")
+                .value_name("VM_MONITOR_ADDR"),
+        )
+        .arg(
+            Arg::new("cgroup")
+                .long("cgroup")
+                .default_value("neon-postgres")
+                .value_name("CGROUP"),
+        )
+        .arg(
+            Arg::new("filecache-connstr")
+                .long("filecache-connstr")
+                .default_value(
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
+                )
+                .value_name("FILECACHE_CONNSTR"),
+        )
+        .arg(
+            Arg::new("file-cache-on-disk")
+                .long("file-cache-on-disk")
+                .action(clap::ArgAction::SetTrue),
+        )
 }

 #[test]
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,12 +1,39 @@
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, Ok, Result};
+use postgres::Client;
 use tokio_postgres::NoTls;
 use tracing::{error, instrument};

 use crate::compute::ComputeNode;

+/// Create a special service table for availability checks
+/// only if it does not exist already.
+pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
+    let query = "
+        DO $$
+        BEGIN
+            IF NOT EXISTS(
+                SELECT 1
+                FROM pg_catalog.pg_tables
+                WHERE tablename = 'health_check'
+            )
+            THEN
+            CREATE TABLE health_check (
+                id serial primary key,
+                updated_at timestamptz default now()
+            );
+            INSERT INTO health_check VALUES (1, now())
+                ON CONFLICT (id) DO UPDATE
+                 SET updated_at = now();
+            END IF;
+        END
+        $$;";
+    client.execute(query, &[])?;
+
+    Ok(())
+}
+
 /// Update timestamp in a row in a special service table to check
 /// that we can actually write some data in this particular timeline.
-/// Create table if it's missing.
 #[instrument(skip_all)]
 pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    // Connect to the database.
@@ -24,19 +51,15 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    });

    let query = "
-    CREATE TABLE IF NOT EXISTS health_check (
-        id serial primary key,
-        updated_at timestamptz default now()
-    );
    INSERT INTO health_check VALUES (1, now())
        ON CONFLICT (id) DO UPDATE
         SET updated_at = now();";

    let result = client.simple_query(query).await?;

-    if result.len() != 2 {
+    if result.len() != 1 {
        return Err(anyhow::format_err!(
-            "expected 2 query results, but got {}",
+            "expected 1 query result, but got {}",
            result.len()
        ));
    }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::env;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
@@ -26,6 +27,7 @@ use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};

+use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -175,6 +177,27 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

+/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
+/// cgroup. Otherwise returns the default `Command::new(cmd)`
+///
+/// This function should be used to start postgres, as it will start it in the
+/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
+/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
+/// creates it during the sysinit phase of its inittab.
+fn maybe_cgexec(cmd: &str) -> Command {
+    // The cplane sets this env var for autoscaling computes.
+    // Use `var_os` so we don't have to worry about the variable being valid
+    // Unicode. That should never be a concern, but just in case.
+    if env::var_os("AUTOSCALING").is_some() {
+        let mut command = Command::new("cgexec");
+        command.args(["-g", "memory:neon-postgres"]);
+        command.arg(cmd);
+        command
+    } else {
+        Command::new(cmd)
+    }
+}
+
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -451,7 +474,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let sync_handle = Command::new(&self.pgbin)
+        let sync_handle = maybe_cgexec(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -586,7 +609,7 @@ impl ComputeNode {

        // Start postgres
        info!("starting postgres");
-        let mut pg = Command::new(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.pgbin)
            .args(["-D", pgdata])
            .spawn()
            .expect("cannot start postgres process");
@@ -614,7 +637,7 @@ impl ComputeNode {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
-        let mut pg = Command::new(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.pgbin)
            .args(["-D", &self.pgdata])
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
@@ -674,6 +697,7 @@ impl ComputeNode {
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
+        create_availability_check_data(&mut client)?;

        // 'Close' connection
        drop(client);
@@ -1056,7 +1080,8 @@ LIMIT 100",

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            let (ext_name, ext_path) =
+                remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -108,12 +108,10 @@ pub fn get_pg_version(pgbin: &str) -> String {
    // pg_config --version returns a (platform specific) human readable string
    // such as "PostgreSQL 15.4". We parse this to v14/v15
    let human_version = get_pg_config("--version", pgbin);
-    if human_version.contains("14") {
-        return "v14".to_string();
-    } else if human_version.contains("15") {
+    if human_version.contains("15") {
        return "v15".to_string();
-    } else if human_version.contains("16") {
-        return "v16".to_string();
+    } else if human_version.contains("14") {
+        return "v14".to_string();
    }
    panic!("Unsuported postgres version {human_version}");
 }
@@ -182,7 +180,19 @@ pub async fn download_extension(
 // Create extension control files from spec
 pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    for ext_data in remote_extensions.extension_data.values() {
+    for (ext_name, ext_data) in remote_extensions.extension_data.iter() {
+        // Check if extension is present in public or custom.
+        // If not, then it is not allowed to be used by this compute.
+        if let Some(public_extensions) = &remote_extensions.public_extensions {
+            if !public_extensions.contains(ext_name) {
+                if let Some(custom_extensions) = &remote_extensions.custom_extensions {
+                    if !custom_extensions.contains(ext_name) {
+                        continue; // skip this extension, it is not allowed
+                    }
+                }
+            }
+        }
+
        for (control_name, control_content) in &ext_data.control_data {
            let control_path = local_sharedir.join(control_name);
            if !control_path.exists() {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -169,7 +169,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    }
                };

-                remote_extensions.get_ext(&filename, is_library)
+                remote_extensions.get_ext(
+                    &filename,
+                    is_library,
+                    &compute.build_tag,
+                    &compute.pgversion,
+                )
            };

            match ext {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -138,7 +138,13 @@ impl ComputeControlPlane {
            mode,
            tenant_id,
            pg_version,
-            skip_pg_catalog_updates: false,
+            // We don't setup roles and databases in the spec locally, so we don't need to
+            // do catalog updates. Catalog updates also include check availability
+            // data creation. Yet, we have tests that check that size and db dump
+            // before and after start are the same. So, skip catalog updates,
+            // with this we basically test a case of waking up an idle compute, where
+            // we also skip catalog updates in the cloud.
+            skip_pg_catalog_updates: true,
        });

        ep.create_endpoint_dir()?;
@@ -152,7 +158,7 @@ impl ComputeControlPlane {
                http_port,
                pg_port,
                pg_version,
-                skip_pg_catalog_updates: false,
+                skip_pg_catalog_updates: true,
            })?,
        )?;
        std::fs::write(
--- a/docs/rfcs/025-generation-numbers.md
+++ b/docs/rfcs/025-generation-numbers.md
@@ -0,0 +1,957 @@
+# Pageserver: split-brain safety for remote storage through generation numbers
+
+## Summary
+
+A scheme of logical "generation numbers" for tenant attachment to pageservers is proposed, along with
+changes to the remote storage format to include these generation numbers in S3 keys.
+
+Using the control plane as the issuer of these generation numbers enables strong anti-split-brain
+properties in the pageserver cluster without implementing a consensus mechanism directly
+in the pageservers.
+
+## Motivation
+
+Currently, the pageserver's remote storage format does not provide a mechanism for addressing
+split brain conditions that may happen when replacing a node or when migrating
+a tenant from one pageserver to another.
+
+From a remote storage perspective, a split brain condition occurs whenever two nodes both think
+they have the same tenant attached, and both can write to S3. This can happen in the case of a
+network partition, pathologically long delays (e.g. suspended VM), or software bugs.
+
+In the current deployment model, control plane guarantees that a tenant is attached to one
+pageserver at a time, thereby ruling out split-brain conditions resulting from dual
+attachment (however, there is always the risk of a control plane bug). This control
+plane guarantee prevents robust response to failures, as if a pageserver is unresponsive
+we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
+attach to a new, different pageserver even if an unresponsive pageserver may be running.
+
+Further, lack of safety during split-brain conditions blocks two important features where occasional
+split-brain conditions are part of the design assumptions:
+
+- seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
+- automatic pageserver instance failure handling (aka "failover") (RFC TBD)
+
+### Prior art
+
+- 020-pageserver-s3-coordination.md
+- 023-the-state-of-pageserver-tenant-relocation.md
+- 026-pageserver-s3-mvcc.md
+
+This RFC has broad similarities to the proposal to implement a MVCC scheme in
+S3 object names, but this RFC avoids a general purpose transaction scheme in
+favour of more specialized "generations" that work like a transaction ID that
+always has the same lifetime as a pageserver process or tenant attachment, whichever
+is shorter.
+
+## Requirements
+
+- Accommodate storage backends with no atomic or fencing capability (i.e. work within
+  S3's limitation that there are no atomics and clients can't be fenced)
+- Don't depend on any STONITH or node fencing in the compute layer (i.e. we will not
+  assume that we can reliably kill an EC2 instance and have it die)
+- Scoped per-tenant, not per-pageserver; for _seamless tenant migration_, we need
+  per-tenant granularity, and for _failover_, we likely want to spread the workload
+  of the failed pageserver instance to a number of peers, rather than monolithically
+  moving the entire workload to another machine.
+  We do not rule out the latter case, but should not constrain ourselves to it.
+
+## Design Tenets
+
+These are not requirements, but are ideas that guide the following design:
+
+- Avoid implementing another consensus system: we already have a strongly consistent
+  database in the control plane that can do atomic operations where needed, and we also
+  have a Paxos implementation in the safekeeper.
+- Avoid locking in to specific models of how failover will work (e.g. do not assume that
+  all the tenants on a pageserver will fail over as a unit).
+- Be strictly correct when it comes to data integrity. Occasional failures of availability
+  are tolerable, occasional data loss is not.
+
+## Non Goals
+
+The changes in this RFC intentionally isolate the design decision of how to define
+logical generation numbers and object storage format in a way that is somewhat flexible with
+respect to how actual orchestration of failover works.
+
+This RFC intentionally does not cover:
+
+- Failure detection
+- Orchestration of failover
+- Standby modes to keep data ready for fast migration
+- Intentional multi-writer operation on tenants (multi-writer scenarios are assumed to be transient split-brain situations).
+- Sharding.
+
+The interaction between this RFC and those features is discussed in [Appendix B](#appendix-b-interoperability-with-other-features)
+
+## Impacted Components
+
+pageserver, control plane, safekeeper (optional)
+
+## Implementation Part 1: Correctness
+
+### Summary
+
+- A per-tenant **generation number** is introduced to uniquely identify tenant attachments to pageserver processes.
+
+  - This generation number increments each time the control plane modifies a tenant (`Project`)'s assigned pageserver, or when the assigned pageserver restarts.
+  - the control plane is the authority for generation numbers: only it may
+    increment a generation number.
+
+- **Object keys are suffixed** with the generation number
+- **Safety for multiply-attached tenants** is provided by the
+  generation number in the object key: the competing pageservers will not
+  try to write to the same keys.
+- **Safety in split brain for multiple nodes running with
+  the same node ID** is provided by the pageserver calling out to the control plane
+  on startup, to re-attach and thereby increment the generations of any attached tenants
+- **Safety for deletions** is achieved by deferring the DELETE from S3 to a point in time where the deleting node has validated with control plane that no attachment with a higher generation has a reference to the to-be-DELETEd key.
+- **The control plane is used to issue generation numbers** to avoid the need for
+  a built-in consensus system in the pageserver, although this could in principle
+  be changed without changing the storage format.
+
+### Generation numbers
+
+A generation number is associated with each tenant in the control plane,
+and each time the attachment status of the tenant changes, this is incremented.
+Changes in attachment status include:
+
+- Attaching the tenant to a different pageserver
+- A pageserver restarting, and "re-attaching" its tenants on startup
+
+These increments of attachment generation provide invariants we need to avoid
+split-brain issues in storage:
+
+- If two pageservers have the same tenant attached, the attachments are guaranteed to have different generation numbers, because the generation would increment
+  while attaching the second one.
+- If there are multiple pageservers running with the same node ID, all the attachments on all pageservers are guaranteed to have different generation numbers, because the generation would increment
+  when the second node started and re-attached its tenants.
+
+As long as the infrastructure does not transparently replace an underlying
+physical machine, we are totally safe. See the later [unsafe case](#unsafe-case-on-badly-behaved-infrastructure) section for details.
+
+### Object Key Changes
+
+#### Generation suffix
+
+All object keys (layer objects and index objects) will contain the attachment
+generation as a [suffix](#why-a-generation-suffix-rather-than-prefix).
+This suffix is the primary mechanism for protecting against split-brain situations, and
+enabling safe multi-attachment of tenants:
+
+- Two pageservers running with the same node ID (e.g. after a failure, where there is
+  some rogue pageserver still running) will not try to write to the same objects, because at startup they will have re-attached tenants and thereby incremented
+  generation numbers.
+- Multiple attachments (to different pageservers) of the same tenant will not try to write to the same objects, as each attachment would have a distinct generation.
+
+The generation is appended in hex format (8 byte string representing
+u32), to all our existing key names. A u32's range limit would permit
+27 restarts _per second_ over a 5 year system lifetime: orders of magnitude more than
+is realistic.
+
+The exact meaning of the generation suffix can evolve over time if necessary, for
+example if we chose to implement a failover mechanism internally to the pageservers
+rather than going via the control plane. The storage format just sees it as a number,
+with the only semantic property being that the highest numbered index is the latest.
+
+#### Index changes
+
+Since object keys now include a generation suffix, the index of these keys must also be updated. IndexPart currently stores keys and LSNs sufficient to reconstruct key names: this would be extended to store the generation as well.
+
+This will increase the size of the file, but only modestly: layers are already encoded as
+their string-ized form, so the overhead is about 10 bytes per layer. This will be less if/when
+the index storage format is migrated to a binary format from JSON.
+
+#### Visibility
+
+_This section doesn't describe code changes, but expands on the consequences of the
+object key changes given above_
+
+##### Visibility of objects to pageservers
+
+Pageservers can of course list objects in S3 at any time, but in practice their
+visible set is based on the contents of their LayerMap, which is initialized
+from the `index_part.json.???` that they load.
+
+Starting with the `index_part` from the most recent previous generation
+(see [loading index_part](#finding-the-remote-indices-for-timelines)), a pageserver
+initially has visibility of all the objects that were referenced in the loaded index.
+These objects are guaranteed to remain visible until the current generation is
+superseded, via pageservers in older generations avoiding deletions (see [deletion](#deletion)).
+
+The "most recent previous generation" is _not_ necessarily the most recent
+in terms of walltime, it is the one that is readable at the time a new generation
+starts. Consider the following sequence of a tenant being re-attached to different
+pageserver nodes:
+
+- Create + attach on PS1 in generation 1
+- PS1 Do some work, write out index_part.json-0001
+- Attach to PS2 in generation 2
+- Read index_part.json-0001
+- PS2 starts doing some work...
+- Attach to PS3 in generation 3
+- Read index_part.json-0001
+- **...PS2 finishes its work: now it writes index_part.json-0002**
+- PS3 writes out index_part.json-0003
+
+In the above sequence, the ancestry of indices is:
+
+```
+0001 -> 0002
+     |
+     -> 0003
+```
+
+This is not an issue for safety: if the 0002 references some object that is
+not in 0001, then 0003 simply does not see it, and will re-do whatever
+work was required (e.g. ingesting WAL or doing compaction). Objects referenced
+by only the 0002 index will never be read by future attachment generations, and
+will eventually be cleaned up by a scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
+
+##### Visibility of LSNs to clients
+
+Because index_part.json is now written with a generation suffix, which data
+is visible depends on which generation the reader is operating in:
+
+- If one was passively reading from S3 from outside of a pageserver, the
+  visibility of data would depend on which index_part.json-<generation> file
+  one had chosen to read from.
+- If two pageservers have the same tenant attached, they may have different
+  data visible as they're independently replaying the WAL, and maintaining
+  independent LayerMaps that are written to independent index_part.json files.
+  Data does not have to be remotely committed to be visible.
+- For a pageserver writing with a stale generation, historic LSNs
+  remain readable until another pageserver (with a higher generation suffix)
+  decides to execute GC deletions. At this point, we may think of the stale
+  attachment's generation as having logically ended: during its existence
+  the generation had a consistent view of the world.
+- For a newly attached pageserver, its highest visible LSN may appear to
+  go backwards with respect to an earlier attachment, if that earlier
+  attachment had not uploaded all data to S3 before the new attachment.
+
+### Deletion
+
+#### Generation number validation
+
+While writes are de-conflicted by writers always using their own generation number in the key,
+deletions are slightly more challenging: if a pageserver A is isolated, and the true active node is
+pageserver B, then it is dangerous for A to do any object deletions, even of objects that it wrote
+itself, because pageserver B's metadata might reference those objects.
+
+We solve this by inserting a "generation validation" step between the write of a remote index
+that un-links a particular object from the index, and the actual deletion of the object, such
+that deletions strictly obey the following ordering:
+
+1. Write out index_part.json: this guarantees that any subsequent reader of the metadata will
+   not try and read the object we unlinked.
+2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
+3. If step 2 passes, it is safe to delete the object. Why? The check-in with control plane
+   together with our visibility rules guarantees that any later generation
+   will use either the exact `index_part.json` that we uploaded in step 1, or a successor
+   of it; not an earlier one. In both cases, the `index_part.json` doesn't reference the
+   key we are deleting anymore, so, the key is invisible to any later attachment generation.
+   Hence it's safe to delete it.
+
+Note that at step 2 we are only confirming that deletions of objects _no longer referenced
+by the specific `index_part.json` written in step 1_ are safe. If we were attempting other deletions concurrently,
+these would need their own generation validation step.
+
+If step 2 fails, we may leak the object. This is safe, but has a cost: see [scrubbing](#cleaning-up-orphan-objects-scrubbing). We may avoid this entirely outside of node
+failures, if we do proper flushing of deletions on clean shutdown and clean migration.
+
+To avoid doing a huge number of control plane requests to perform generation validation,
+validation of many tenants will be done in a single request, and deletions will be queued up
+prior to validation: see [Persistent deletion queue](#persistent-deletion-queue) for more.
+
+#### `remote_consistent_lsn` updates
+
+Remote objects are not the only kind of deletion the pageserver does: it also indirectly deletes
+WAL data, by feeding back remote_consistent_lsn to safekeepers, as a signal to the safekeepers that
+they may drop data below this LSN.
+
+For the same reasons that deletion of objects must be guarded by an attachment generation number
+validation step, updates to `remote_consistent_lsn` are subject to the same rules, using
+an ordering as follows:
+
+1. upload the index_part that covers data up to LSN `L0` to S3
+2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
+3. advance the `remote_consistent_lsn` that we advertise to the safekeepers to `L0`
+
+If step 2 fails, then the `remote_consistent_lsn` advertised
+to safekeepers will not advance again until a pageserver
+with the latest generation is ready to do so.
+
+**Note:** at step 3 we are not advertising the _latest_ remote_consistent_lsn, we are
+advertising the value in the index_part that we uploaded in step 1. This provides
+a strong ordering guarantee.
+
+Internally to the pageserver, each timeline will have two remote_consistent_lsn values: the one that
+reflects its latest write to remote storage, and the one that reflects the most
+recent validation of generation number. It is only the latter value that may
+be advertised to the outside world (i.e. to the safekeeper).
+
+The control plane remains unaware of `remote_consistent_lsn`: it only has to validate
+the freshness of generation numbers, thereby granting the pageserver permission to
+share the information with the safekeeper.
+
+For convenience, in subsequent sections and RFCs we will use "deletion" to mean both deletion
+of objects in S3, and updates to the `remote_consistent_lsn`, as updates to the remote consistent
+LSN are de-facto deletions done via the safekeeper, and both kinds of deletion are subject to
+the same generation validation requirement.
+
+### Pageserver attach/startup changes
+
+#### Attachment
+
+Calls to `/v1/tenant/{tenant_id}/attach` are augmented with an additional
+`generation` field in the body.
+
+The pageserver does not persist this: a generation is only good for the lifetime
+of a process.
+
+#### Finding the remote indices for timelines
+
+Because index files are now suffixed with generation numbers, the pageserver
+cannot always GET the remote index in one request, because it can't always
+know a-priori what the latest remote index is.
+
+Typically, the most recent generation to write an index would be our own
+generation minus 1. However, this might not be the case: the previous
+node might have started and acquired a generation number, and then crashed
+before writing out a remote index.
+
+In the general case and as a fallback, the pageserver may list all the `index_part.json`
+files for a timeline, sort them by generation, and pick the highest that is `<=`
+its current generation for this attachment. The tenant should never load an index
+with an attachment generation _newer_ than its own.
+These two rules combined ensure that objects written by later generations are never visible to earlier generations.
+
+Note that if a given attachment picks an index part from an earlier generation (say n-2), but crashes & restarts before it writes its own generation's index part, next time it tries to pick an index part there may be an index part from generation n-1.
+It would pick the n-1 index part in that case, because it's sorted higher than the previous one from generation n-2.
+So, the above rules do not guarantee determinism in selecting the index part. Note also that tenants
+are allowed to be attached with stale attachment generations during a multiply-attached
+phase in a migration, and in this instance if the old location's pageserver restarts,
+it should not try and load the newer generation's index.
+
+To summarize, on starting a timeline, the pageserver will:
+
+1. Issue a GET for index_part.json-<my generation - 1>
+2. If 1 failed, issue a ListObjectsV2 request for index_part.json\* and
+   pick the newest whose generation is `<=` our own.
+
+One could optimize this further by using the control plane to record specifically
+which generation most recently wrote an index_part.json, if necessary, to increase
+the probability of finding the index_part.json in one GET. One could also improve
+the chances by having pageservers proactively write out index_part.json after they
+get a new generation ID.
+
+#### Re-attachment on startup
+
+On startup, the pageserver will call out to a new control plane `/re-attach`
+API (see [Generation API](#generation-api)). This returns a list of
+tenants that should be attached to the pageserver, and their generation numbers, which
+the control plane will increment before returning.
+
+The pageserver should still scan its local disk on startup, but should _delete_
+any local content for tenants not indicated in the `/re-attach` response: their
+absence is an implicit detach operation.
+
+**Note** if a tenant is omitted from the re-attach response, its local disk content
+will be deleted. This will change in subsequent work, when the control plane gains
+the concept of a secondary/standby location: a node with local content may revert
+to this status and retain some local content.
+
+#### Cleaning up previous generations' remote indices
+
+Deletion of old indices is not necessary for correctness, although it is necessary
+to avoid the ListObjects fallback in the previous section becoming ever more expensive.
+
+Once the new attachment has written out its index_part.json, it may asynchronously clean up historic index_part.json
+objects that were found.
+
+We may choose to implement this deletion either as an explicit step after we
+write out index_part for the first time in a pageserver's lifetime, or for
+simplicity just do it periodically as part of the background scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
+
+### Control Plane Changes
+
+#### Store generations for attaching tenants
+
+- The `Project` table must store the generation number for use when
+  attaching the tenant to a new pageserver.
+- The `/v1/tenant/:tenant_id/attach` pageserver API will require the generation number,
+  which the control plane can supply by simply incrementing the `Project`'s
+  generation number each time the tenant is attached to a different server: the same database
+  transaction that changes the assigned pageserver should also change the generation number.
+
+#### Generation API
+
+This section describes an API that could be provided directly by the control plane,
+or built as a separate microservice. In earlier parts of the RFC, when we
+discuss the control plane providing generation numbers, we are referring to this API.
+
+The API endpoints used by the pageserver to acquire and validate generation
+numbers are quite simple, and only require access to some persistent and
+linearizable storage (such as a database).
+
+Building this into the control plane is proposed as a least-effort option to exploit existing infrastructure and implement generation number issuance in the same transaction that mandates it (i.e., the transaction that updates the `Project` assignment to another pageserver).
+However, this is not mandatory: this "Generation Number Issuer" could
+be built as a microservice. In practice, we will write such a miniature service
+anyway, to enable E2E pageserver/compute testing without control plane.
+
+The endpoints required by pageservers are:
+
+##### `/re-attach`
+
+- Request: `{node_id: <u32>}`
+- Response:
+  - 200 `{tenants: [{id: <TenantId>, gen: <u32>}]}`
+  - 404: unknown node_id
+  - (Future: 429: flapping detected, perhaps nodes are fighting for the same node ID,
+    or perhaps this node was in a retry loop)
+  - (On unknown tenants, omit tenant from `tenants` array)
+- Server behavior: query database for which tenants should be attached to this pageserver.
+  - for each tenant that should be attached, increment the attachment generation and
+    include the new generation in the response
+- Client behavior:
+  - for all tenants in the response, activate with the new generation number
+  - for any local disk content _not_ referenced in the response, act as if we
+    had been asked to detach it (i.e. delete local files)
+
+**Note** the `node_id` in this request will change in future if we move to ephemeral
+node IDs, to be replaced with some correlation ID that helps the control plane realize
+if a process is running with the same storage as a previous pageserver process (e.g.
+we might use EC2 instance ID, or we might just write some UUID to the disk the first
+time we use it)
+
+##### `/validate`
+
+- Request: `{'tenants': [{tenant: <tenant id>, attach_gen: <gen>}, ...]}`
+- Response:
+  - 200 `{'tenants': [{tenant: <tenant id>, status: <bool>}...]}`
+  - (On unknown tenants, omit tenant from `tenants` array)
+- Purpose: enable the pageserver to discover for the given attachments whether they are still the latest.
+- Server behavior: this is a read-only operation: simply compare the generations in the request with
+  the generations known to the server, and set status to `true` if they match.
+- Client behavior: clients must not do deletions within a tenant's remote data until they have
+  received a response indicating the generation they hold for the attachment is current.
+
+#### Use of `/load` and `/ignore` APIs
+
+Because the pageserver will be changed to only attach tenants on startup
+based on the control plane's response to a `/re-attach` request, the load/ignore
+APIs no longer make sense in their current form.
+
+The `/load` API becomes functionally equivalent to attach, and will be removed:
+any location that used `/load` before should just attach instead.
+
+The `/ignore` API is equivalent to detaching, but without deleting local files.
+
+### Timeline/Branch creation & deletion
+
+All of the previous arguments for safety have described operations within
+a timeline, where we may describe a sequence that includes updates to
+index_part.json, and where reads and writes are coming from a postgres
+endpoint (writes via the safekeeper).
+
+Creating or destroying a timeline is a bit different, because writes
+are coming from the control plane.
+
+We must be safe against scenarios such as:
+
+- A tenant is attached to pageserver B while pageserver A is
+  in the middle of servicing an RPC from the control plane to
+  create or delete a tenant.
+- A pageserver A has been sent a timeline creation request
+  but becomes unresponsive. The tenant is attached to a
+  different pageserver B, and the timeline creation request
+  is sent there too.
+
+#### Timeline Creation
+
+If some very slow node tries to do a timeline creation _after_
+a more recent generation node has already created the timeline
+and written some data into it, that must not cause harm. This
+is provided in timeline creations by the way all the objects
+within the timeline's remote path include a generation suffix:
+a slow node in an old generation that attempts to "create" a timeline
+that already exists will just emit an index_part.json with
+an old generation suffix.
+
+Timeline IDs are never reused, so we don't have
+to worry about the case of create/delete/create cycles. If they
+were re-used during a disaster recovery "un-delete" of a timeline,
+that special case can be handled by calling out to all available pageservers
+to check that they return 404 for the timeline, and to flush their
+deletion queues in case they had any deletions pending from the
+timeline.
+
+The above makes it safe for control plane to change the assignment of
+tenant to pageserver in control plane while a timeline creation is ongoing.
+The reason is that the creation request against the new assigned pageserver
+uses a new generation number. However, care must be taken by control plane
+to ensure that a "timeline creation successful" response from some pageserver
+is checked for the pageserver's generation for that timeline's tenant still being the latest.
+If it is not the latest, the response does not constitute a successful timeline creation.
+It is acceptable to discard such responses, the scrubber will clean up the S3 state.
+It is better to issue a timeline deletion request to the stale attachment.
+
+#### Timeline Deletion
+
+Tenant/timeline deletion operations are exempt from generation validation
+on deletes, and therefore don't have to go through the same deletion
+queue as GC/compaction layer deletions. This is because once a
+delete is issued by the control plane, it is a promise that the
+control plane will keep trying until the deletion is done, so even stale
+pageservers are permitted to go ahead and delete the objects.
+
+The implications of this for control plane are:
+
+- During timeline/tenant deletion, the control plane must wait for the deletion to
+  be truly complete (status 404) and also handle the case where the pageserver
+  becomes unavailable, either by waiting for a replacement with the same node_id,
+  or by re-attaching the tenant elsewhere.
+
+- The control plane must persist its intent to delete
+  a timeline/tenant before issuing any RPCs, and then once it starts, it must
+  keep retrying until the tenant/timeline is gone. This is already handled
+  by using a persistent `Operation` record that is retried indefinitely.
+
+Timeline deletion may result in a special kind of object leak, where
+the latest generation attachment completes a deletion (including erasing
+all objects in the timeline path), but some slow/partitioned node is
+writing into the timeline path with a stale generation number. This would
+not be caught by any per-timeline scrubbing (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)), since scrubbing happens on the
+attached pageserver, and once the timeline is deleted it isn't attached anywhere.
+This scenario should be pretty rare, and the control plane can make it even
+rarer by ensuring that if a tenant is in a multi-attached state (e.g. during
+migration), we wait for that to complete before processing the deletion. Beyond
+that, we may implement some other top-level scrub of timelines in
+an external tool, to identify any tenant/timeline paths that are not found
+in the control plane database.
+
+#### Examples
+
+- Deletion, node restarts partway through:
+  - By the time we returned 202, we have written a remote delete marker
+  - Any subsequent incarnation of the same node_id will see the remote
+    delete marker and continue to process the deletion
+  - If the original pageserver is lost permanently and no replacement
+    with the same node_id is available, then the control plane must recover
+    by re-attaching the tenant to a different node.
+- Creation, node becomes unresponsive partway through.
+  - Control plane will see HTTP request timeout, keep re-issuing
+    request to whoever is the latest attachment point for the tenant
+    until it succeeds.
+  - Stale nodes may be trying to execute timeline creation: they will
+    write out index_part.json files with
+    stale attachment generation: these will be eventually cleaned up
+    by the same mechanism as other old indices.
+
+### Unsafe case on badly behaved infrastructure
+
+This section is only relevant if running on a different environment
+than EC2 machines with ephemeral disks.
+
+If we ever run pageservers on infrastructure that might transparently restart
+a pageserver while leaving an old process running (e.g. a VM gets rescheduled
+without the old one being fenced), then there is a risk of corruption, when
+the control plane attaches the tenant, as follows:
+
+- If the control plane sends an `/attach` request to node A, then node A dies
+  and is replaced, and the control plane retries the request without
+  incrementing that attachment ID, then it could end up with two physical nodes
+  both using the same generation number.
+- This is not an issue when using EC2 instances with ephemeral storage, as long
+  as the control plane never re-uses a node ID, but it would need re-examining
+  if running on different infrastructure.
+- To robustly protect against this class of issue, we would either:
+  - add a "node generation" to distinguish between different processes holding the
+    same node_id.
+  - or, dispense with static node_id entirely and issue an ephemeral ID to each
+    pageserver process when it starts.
+
+## Implementation Part 2: Optimizations
+
+### Persistent deletion queue
+
+Between writing out a new index_part.json that doesn't reference an object,
+and executing the deletion, an object passes through a window where it is
+only referenced in memory, and could be leaked if the pageserver is stopped
+uncleanly. That introduces conflicting incentives: on the one hand, we would
+like to delay and batch deletions to
+1. minimize the cost of the mandatory validation calls to control plane, and
+2. minimize cost for DeleteObjects requests.
+On the other hand we would also like to minimize leakage by executing
+deletions promptly.
+
+To resolve this, we may make the deletion queue persistent
+and then execute the deletions in the background at a later time.
+
+_Note: The deletion queue's reason for existence is optimization rather than correctness,
+so there is a lot of flexibility in exactly how it should work,
+as long as it obeys the rule to validate generations before executing deletions,
+so the following details are not essential to the overall RFC._
+
+#### Scope
+
+The deletion queue will be global per pageserver, not per-tenant. There
+are several reasons for this choice:
+
+- Use the queue as a central point to coalesce validation requests to the
+  control plane: this avoids individual `Timeline` objects ever touching
+  the control plane API, and avoids them having to know the rules about
+  validating deletions. This separation of concerns will avoid burdening
+  the already many-LoC `Timeline` type with even more responsibility.
+- Decouple the deletion queue from Tenant attachment lifetime: we may
+  "hibernate" an inactive tenant by tearing down its `Tenant`/`Timeline`
+  objects in the pageserver, without having to wait for deletions to be done.
+- Amortize the cost of I/O for the persistent queue, instead of having many
+  tiny queues.
+- Coalesce deletions into a smaller number of larger DeleteObjects calls
+
+Because of the cost of doing I/O for persistence, and the desire to coalesce
+generation validation requests across tenants, and coalesce deletions into
+larger DeleteObjects requests, there will be one deletion queue per pageserver
+rather than one per tenant. This has the added benefit that when deactivating
+a tenant, we do not have to drain their deletion queue: deletions can proceed
+for a tenant whose main `Tenant` object has been torn down.
+
+#### Flow of deletion
+
+The flow of a deletion becomes:
+
+1. Need for deletion of an object (=> layer file) is identified.
+2. Unlink the object from all the places that reference it (=> `index_part.json`).
+3. Enqueue the deletion to a persistent queue.
+   Each entry is `tenant_id, attachment_generation, S3 key`.
+4. Validate & execute in batches:
+  4.1 For a batch of entries, call into control plane.
+  4.2 For the subset of entries that passed validation, execute a `DeleteObjects` S3 DELETE request for their S3 keys.
+
+As outlined in the Part 1 on correctness, it is critical that deletions are only
+executed once the key is not referenced anywhere in S3.
+This property is obviously upheld by the scheme above.
+
+#### We Accept Object Leakage In Acceptable Circumstances
+
+If we crash in the flow above between (2) and (3), we lose track of the unreferenced object.
+Further, enqueuing a single entry to the persistent queue may not be durable immediately, in order to amortize the cost of flushing to disk.
+This is acceptable for now, it can be caught by [the scrubber](#cleaning-up-orphan-objects-scrubbing).
+
+There are various measures we can take to improve this in the future.
+1. Cap amount of time until enqueued entry becomes durable (timeout for flush-to-disk)
+2. Proactively flush:
+    - On graceful shutdown, as we anticipate that some or
+      all of our attachments may be re-assigned while we are offline.
+    - On tenant detach.
+3. For each entry, keep track of whether it has passed (2).
+   Only admit entries to (4) once they have passed (2).
+   This requires re-writing / two queue entries (intent, commit) per deletion.
+
+The important take-away with any of the above is that it's not
+disastrous to leak objects in exceptional circumstances.
+
+#### Operations that may skip the queue
+
+Deletions of an entire timeline are [exempt](#Timeline-Deletion) from generation number validation. Once the
+control plane sends the deletion request, there is no requirement to retain the readability
+of any data within the timeline, and all objects within the timeline path may be deleted
+at any time from the control plane's deletion request onwards.
+
+Since deletions of smaller timelines won't have enough objects to compose a full sized
+DeleteObjects request, it is still useful to send these through the last part of the
+deletion pipeline to coalesce with other executing deletions: to enable this, the
+deletion queue should expose two input channels: one for deletions that must be
+processed in a generation-aware way, and a fast path for timeline deletions, where
+that fast path may skip validation and the persistent queue.
+
+### Cleaning up orphan objects (scrubbing)
+
+An orphan object is any object which is no longer referenced by a running node or by metadata.
+
+Examples of how orphan objects arise:
+
+- A node PUTs a layer object, then crashes before it writes the
+  index_part.json that references that layer.
+- A stale node carries on running for some time, and writes out an unbounded number of
+  objects while it believes itself to be the rightful writer for a tenant.
+- A pageserver crashes between un-linking an object from the index, and persisting
+  the object to its deletion queue.
+
+Orphan objects are functionally harmless, but have a small cost due to S3 capacity consumed. We
+may clean them up at some time in the future, by doing a ListObjectsv2 operation and cross
+referencing with the latest metadata to identify objects which are not referenced.
+
+Scrubbing will be done only by an attached pageserver (not some third party process), and deletions requested during scrub will go through the same
+validation as all other deletions: the attachment generation must be
+fresh. This avoids the possibility of a stale pageserver incorrectly
+thinking that an object written by a newer generation is stale, and deleting
+it.
+
+It is not strictly necessary that scrubbing be done by an attached
+pageserver: it could also be done externally. However, an external
+scrubber would still require the same validation procedure that
+a pageserver's deletion queue performs, before actually erasing
+objects.
+
+## Operational impact
+
+### Availability
+
+Coordination of generation numbers via the control plane introduces a dependency for certain
+operations:
+
+1. Starting new pageservers (or activating pageservers after a restart)
+2. Executing enqueued deletions
+3. Advertising updated `remote_consistent_lsn` to enable WAL trimming
+
+Item 1. would mean that some in-place restarts that previously would have resumed service even if the control plane were
+unavailable, will now not resume service to users until the control plane is available. We could
+avoid this by having a timeout on communication with the control plane, and after some timeout,
+resume service with the previous generation numbers (assuming this was persisted to disk). However,
+this is unlikely to be needed as the control plane is already an essential & highly available component. Also, having a node re-use an old generation number would complicate
+reasoning about the system, as it would break the invariant that a generation number uniquely identifies
+a tenant's attachment to a given pageserver _process_: it would merely identify the tenant's attachment
+to the pageserver _machine_ or its _on-disk-state_.
+
+Item 2. is a non-issue operationally: it's harmless to delay deletions, the only impact of objects pending deletion is
+the S3 capacity cost.
+
+Item 3. could be an issue if safekeepers are low on disk space and the control plane is unavailable for a long time. If this became an issue,
+we could adjust the safekeeper to delete segments from local disk sooner, as soon as they're uploaded to S3, rather than waiting for
+remote_consistent_lsn to advance.
+
+For a managed service, the general approach should be to make sure we are monitoring & respond fast enough
+that control plane outages are bounded in time.
+
+There is also the fact that control plane runs in a single region.
+The latency for distant regions is not a big concern for us because all request types added by this RFC are either infrequent or not in the way of the data path.
+However, we lose region isolation for the operations listed above.
+The ongoing work to split console and control will give us per-region control plane, and all operations in this RFC can be handled by these per-region control planes.
+With that in mind, we accept the trade-offs outlined in this paragraph.
+
+We will also implement an "escape hatch" config for generation numbers, where in a major disaster outage,
+we may manually run pageservers with a hand-selected generation number, so that we can bring them online
+independently of a control plane.
+
+### Rollout
+
+Although there is coupling between components, we may deploy most of the new data plane components
+independently of the control plane: initially they can just use a static generation number.
+
+#### Phase 1
+
+The pageserver is deployed with some special config to:
+
+- Always act like everything is generation 1 and do not wait for a control plane issued generation on attach
+- Skip the places in deletion and remote_consistent_lsn updates where we would call into control plane
+
+#### Phase 2
+
+The control plane changes are deployed: control plane will now track and increment generation numbers.
+
+#### Phase 3
+
+The pageserver is deployed with its control-plane-dependent changes enabled: it will now require
+the control plane to service re-attach requests on startup, and handle generation
+validation requests.
+
+### On-disk backward compatibility
+
+Backward compatibility with existing data is straightforward:
+
+- When reading the index, we may assume that any layer whose metadata doesn't include
+  generations will have a path without generation suffix.
+- When locating the index file on attachment, we may use the "fallback" listing path
+  and if there is only an index without generation suffix, that is the one we load.
+
+It is not necessary to re-write existing layers: even new index files will be able
+to represent generation-less layers.
+
+### On-disk forward compatibility
+
+We will do a two phase rollout, probably over multiple releases because we will naturally
+have some of the read-side code ready before the overall functionality is ready:
+
+1. Deploy pageservers which understand the new index format and generation suffixes
+   in keys, but do not write objects with generation numbers in the keys.
+2. Deploy pageservers that write objects with generation numbers in the keys.
+
+Old pageservers will be oblivious to generation numbers. That means that they can't
+read objects with generation numbers in the name. This is why the
+first step must deploy the ability to read, before the second step
+starts writing them.
+
+# Frequently Asked Questions
+
+## Why a generation _suffix_ rather than _prefix_?
+
+The choice is motivated by object listing, since one can list by prefix but not
+suffix.
+
+In [finding remote indices](#finding-the-remote-indices-for-timelines), we rely
+on being able to do a prefix listing for `<tenant>/<timeline>/index_part.json*`.
+That relies on the prefix listing.
+
+The converse case of using a generation prefix and listing by generation is
+not needed: one could imagine listing by generation while scrubbing (so that
+a particular generation's layers could be scrubbed), but this is not part
+of normal operations, and the [scrubber](#cleaning-up-orphan-objects-scrubbing) probably won't work that way anyway.
+
+## Wouldn't it be simpler to have a separate deletion queue per timeline?
+
+Functionally speaking, we could. That's how RemoteTimelineClient currently works,
+but this approach does not map well to a long-lived persistent queue with
+generation validation.
+
+Anything we do per-timeline generates tiny random I/O, on a pageserver with
+tens of thousands of timelines operating: to be ready for high scale, we should:
+
+- A) Amortize costs where we can (e.g. a shared deletion queue)
+- B) Expect to put tenants into a quiescent state while they're not
+  busy: i.e. we shouldn't keep a tenant alive to service its deletion queue.
+
+This was discussed in the [scope](#scope) part of the deletion queue section.
+
+# Appendix A: Examples of use in high availability/failover
+
+The generation numbers proposed in this RFC are adaptable to a variety of different
+failover scenarios and models. The sections below sketch how they would work in practice.
+
+### In-place restart of a pageserver
+
+"In-place" here means that the restart is done before any other element in the system
+has taken action in response to the node being down.
+
+- After restart, the node issues a re-attach request to the control plane, and
+  receives new generation numbers for all its attached tenants.
+- Tenants may be activated with the generation number in the re-attach response.
+- If any of its attachments were in fact stale (i.e. had been reassigned to another
+  node while this node was offline), then
+  - the re-attach response will inform the pageserver of this by _not_
+    incrementing the generation for that attachment.
+  - This will implicitly block deletions in the tenant, but as an optimization
+    the pageserver should also proactively stop doing S3 uploads when it notices this stale-generation state.
+  - The control plane is expected to eventually detach this tenant from the
+    pageserver.
+
+If the control plane does not include a tenant in the re-attach response,
+but there is still local state for the tenant in the filesystem, the pageserver
+deletes the local state in response and does not load/activate the tenant.
+See the [earlier section on pageserver startup](#pageserver-attachstartup-changes) for details.
+Control plane can use this mechanism to clean up a pageserver that has been
+down for so long that all its tenants were migrated away before it came back
+up again and asked for re-attach.
+
+### Failure of a pageserver
+
+In this context, read "failure" as the most ambiguous possible case, where
+a pageserver is unavailable to clients and control plane, but may still be executing and talking
+to S3.
+
+#### Case A: re-attachment to other nodes
+
+1. Let's say node 0 becomes unresponsive in a cluster of three nodes 0, 1, 2.
+2. Some external mechanism notices that the node is unavailable and initiates
+   movement of all tenants attached to that node to a different node according
+   to some distribution rule.
+   In this example, it would mean incrementing the generation
+   of all tenants that were attached to node 0, as each tenant's assigned pageserver changes.
+3. A tenant which is now attached to node 1 will _also_ still be attached to node
+   0, from the perspective of node 0. Node 0 will still be using its old generation,
+   node 1 will be using a newer generation.
+4. S3 writes will continue from nodes 0 and 1: there will be an index_part.json-00000001
+   _and_ an index_part.json-00000002. Objects written under the old suffix
+   after the new attachment was created do not matter from the rest of the system's
+   perspective: the endpoints are reading from the new attachment location. Objects
+   written by node 0 are just garbage that can be cleaned up at leisure. Node 0 will
+   not do any deletions because it can't synchronize with control plane, or if it could,
+   its deletion queue processing would get errors for the validation requests.
+
+#### Case B: direct node replacement with same node_id and drive
+
+This is the scenario we would experience if running pageservers in some dynamic
+VM/container environment that would auto-replace a given node_id when it became
+unresponsive, with the node's storage supplied by some network block device
+that is attached to the replacement VM/container.
+
+1. Let's say node 0 fails, and there may be some other peers but they aren't relevant.
+2. Some external mechanism notices that the node is unavailable, and creates
+   a "new node 0" (Node 0b) which is a physically separate server. The original node 0
+   (Node 0a) may still be running, because we do not assume the environment fences nodes.
+3. On startup, node 0b re-attaches and gets higher generation numbers for
+   all tenants.
+4. S3 writes continue from nodes 0a and 0b, but the writes do not collide due to different
+   generation in the suffix, and the writes from node 0a are not visible to the rest
+   of the system because endpoints are reading only from node 0b.
+
+# Appendix B: interoperability with other features
+
+## Sharded Keyspace
+
+The design in this RFC maps neatly to a sharded keyspace design where subsets of the key space
+for a tenant are assigned to different pageservers:
+
+- the "unit of work" for attachments becomes something like a TenantShard rather than a Tenant
+- TenantShards get generation numbers just as Tenants do.
+- Write workload (ingest, compaction) for a tenant is spread out across pageservers via
+  TenantShards, but each TenantShard still has exactly one valid writer at a time.
+
+## Read replicas
+
+_This section is about a passive reader of S3 pageserver state, not a postgres
+read replica_
+
+For historical reads to LSNs below the remote persistent LSN, any node may act as a reader at any
+time: remote data is logically immutable data, and the use of deferred deletion in this RFC helps
+mitigate the fact that remote data is not _physically_ immutable (i.e. the actual data for a given
+page moves around as compaction happens).
+
+A read replica needs to be aware of generations in remote data in order to read the latest
+metadata (find the index_part.json with the latest suffix). It may either query this
+from the control plane, or find it with a ListObjectsv2 request.
+
+## Seamless migration
+
+To make tenant migration totally seamless, we will probably want to intentionally double-attach
+a tenant briefly, serving reads from the old node while waiting for the new node to be ready.
+
+This RFC enables that double-attachment: two nodes may be attached at the same time, with the migration destination
+having a higher generation number. The old node will be able to ingest and serve reads, but not
+do any deletes. The new node's attachment must also avoid deleting layers that the old node may
+still use. A new piece of state
+will be needed for this in the control plane's definition of an attachment.
+
+## Warm secondary locations
+
+To enable faster tenant movement after a pageserver is lost, we will probably want to spend some
+disk capacity on keeping standby locations populated with local disk data.
+
+There's no conflict between this RFC and that: implementing warm secondary locations on a per-tenant basis
+would be a separate change to the control plane to store standby location(s) for a tenant. Because
+the standbys do not write to S3, they do not need to be assigned generation numbers. When a tenant is
+re-attached to a standby location, that would increment the tenant attachment generation and this
+would work the same as any other attachment change, but with a warm cache.
+
+## Ephemeral node IDs
+
+This RFC intentionally avoids changing anything fundamental about how pageservers are identified
+and registered with the control plane, to avoid coupling the implementation of pageserver split
+brain protection with more fundamental changes in the management of the pageservers.
+
+Moving to ephemeral node IDs would provide an extra layer of
+resilience in the system, as it would prevent the control plane
+accidentally attaching to two physical nodes with the same
+generation, if somehow there were two physical nodes with
+the same node IDs (currently we rely on EC2 guarantees to
+eliminate this scenario). With ephemeral node IDs, there would be
+no possibility of that happening, no matter the behavior of
+underlying infrastructure.
+
+Nothing fundamental in the pageserver's handling of generations needs to change to handle ephemeral node IDs, since we hardly use the
+`node_id` anywhere. The `/re-attach` API would be extended
+to enable the pageserver to obtain its ephemeral ID, and provide
+some correlation identifier (e.g. EC2 instance ID), to help the
+control plane re-attach tenants to the same physical server that
+previously had them attached.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -89,6 +89,8 @@ impl RemoteExtSpec {
        &self,
        ext_name: &str,
        is_library: bool,
+        build_tag: &str,
+        pg_major_version: &str,
    ) -> anyhow::Result<(String, RemotePath)> {
        let mut real_ext_name = ext_name;
        if is_library {
@@ -104,11 +106,32 @@ impl RemoteExtSpec {
                .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
        }

+        // Check if extension is present in public or custom.
+        // If not, then it is not allowed to be used by this compute.
+        if let Some(public_extensions) = &self.public_extensions {
+            if !public_extensions.contains(&real_ext_name.to_string()) {
+                if let Some(custom_extensions) = &self.custom_extensions {
+                    if !custom_extensions.contains(&real_ext_name.to_string()) {
+                        return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
+                    }
+                }
+            }
+        }
+
        match self.extension_data.get(real_ext_name) {
-            Some(ext_data) => Ok((
-                real_ext_name.to_string(),
-                RemotePath::from_string(&ext_data.archive_path)?,
-            )),
+            Some(_ext_data) => {
+                // Construct the path to the extension archive
+                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
+                //
+                // Keep it in sync with path generation in
+                // https://github.com/neondatabase/build-custom-extensions/tree/main
+                let archive_path_str =
+                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
+                Ok((
+                    real_ext_name.to_string(),
+                    RemotePath::from_string(&archive_path_str)?,
+                ))
+            }
            None => Err(anyhow::anyhow!(
                "real_ext_name {} is not found",
                real_ext_name
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -31,6 +31,8 @@ fn lsn_invalid() -> Lsn {
 #[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
+    /// Term.
+    pub term: Option<u64>,
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
@@ -58,4 +60,6 @@ pub struct SkTimelineInfo {
    /// A connection string to use for WAL receiving.
    #[serde(default)]
    pub safekeeper_connstr: Option<String>,
+    #[serde(default)]
+    pub http_connstr: Option<String>,
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,6 +26,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -37,6 +38,7 @@ url.workspace = true
 uuid.workspace = true

 pq_proto.workspace = true
+postgres_connection.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -1,18 +1,31 @@
 use std::fmt::{Debug, Display};

 use futures::Future;
+use tokio_util::sync::CancellationToken;

 pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
 pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;

-pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+pub async fn exponential_backoff(
+    n: u32,
+    base_increment: f64,
+    max_seconds: f64,
+    cancel: &CancellationToken,
+) {
    let backoff_duration_seconds =
        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
    if backoff_duration_seconds > 0.0 {
        tracing::info!(
            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+
+        drop(
+            tokio::time::timeout(
+                std::time::Duration::from_secs_f64(backoff_duration_seconds),
+                cancel.cancelled(),
+            )
+            .await,
+        )
    }
 }

@@ -24,28 +37,57 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
    }
 }

+/// Cancellation settings for a retried operation: when to cancel (`token`), and
+/// which error to return once cancelled (produced by calling `on_cancel`).
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    token: CancellationToken,
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
 /// retries passed operation until one of the following conditions are met:
 /// Encountered error is considered as permanent (non-retryable)
 /// Retries have been exhausted.
 /// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
 /// When attempts cross `warn_threshold` function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-pub async fn retry<T, O, F, E>(
+/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
+/// to drop out promptly on shutdown.
+pub async fn retry<T, O, F, E, CF>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
+    cancel: Cancel<E, CF>,
 ) -> Result<T, E>
 where
    // Not std::error::Error because anyhow::Error doesnt implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
-    E: Display + Debug,
+    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
 {
    let mut attempts = 0;
    loop {
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
+        }
+
        let result = op().await;
        match result {
            Ok(_) => {
@@ -80,6 +122,7 @@ where
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel.token,
        )
        .await;
        attempts += 1;
@@ -132,6 +175,7 @@ mod tests {
            1,
            1,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await;

@@ -157,6 +201,7 @@ mod tests {
            2,
            2,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap();
@@ -179,6 +224,7 @@ mod tests {
            2,
            2,
            "work",
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
        .unwrap_err();
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -0,0 +1,113 @@
+use std::fmt::Debug;
+
+use serde::{Deserialize, Serialize};
+
+/// Tenant generations are used to provide split-brain safety and allow
+/// multiple pageservers to attach the same tenant concurrently.
+///
+/// See docs/rfcs/025-generation-numbers.md for detail on how generation
+/// numbers are used.
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
+pub enum Generation {
+    /// Generations with this magic value will not add a suffix to S3 keys, and will not
+    /// be included in persisted index_part.json.  This value is only to be used
+    /// during migration from pre-generation metadata to generation-aware metadata,
+    /// and should eventually go away.
+    ///
+    /// A special Generation is used rather than always wrapping Generation in an Option,
+    /// so that code handling generations doesn't have to be aware of the legacy
+    /// case everywhere it touches a generation.
+    None,
+    /// Generations with this magic value may never be used to construct S3 keys:
+    /// we will panic if someone tries to.  This is for Tenants in the "Broken" state,
+    /// so that we can satisfy their constructor with a Generation without risking
+    /// a code bug using it in an S3 write (broken tenants should never write)
+    Broken,
+    Valid(u32),
+}
+
+/// The Generation type represents a number associated with a Tenant, which
+/// increments every time the tenant is attached to a new pageserver, or
+/// an attached pageserver restarts.
+///
+/// It is included as a suffix in S3 keys, as a protection against split-brain
+/// scenarios where pageservers might otherwise issue conflicting writes to
+/// remote storage
+impl Generation {
+    /// Create a new Generation that represents a legacy key format with
+    /// no generation suffix
+    pub fn none() -> Self {
+        Self::None
+    }
+
+    /// Create a new generation that will panic if you try to call get_suffix() on it
+    pub fn broken() -> Self {
+        Self::Broken
+    }
+
+    pub fn new(v: u32) -> Self {
+        Self::Valid(v)
+    }
+
+    pub fn is_none(&self) -> bool {
+        matches!(self, Self::None)
+    }
+
+    pub fn get_suffix(&self) -> String {
+        match self {
+            Self::Valid(v) => {
+                format!("-{:08x}", v)
+            }
+            Self::None => "".into(),
+            Self::Broken => {
+                panic!("Tried to use a broken generation");
+            }
+        }
+    }
+}
+
+impl Serialize for Generation {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if let Self::Valid(v) = self {
+            v.serialize(serializer)
+        } else {
+            // We should never be asked to serialize a None or Broken: structures with
+            // an optional generation should convert None to Option<Generation>::None.
+            // The message uses Debug formatting because Generation has no Display impl.
+            Err(serde::ser::Error::custom(format!(
+                "Tried to serialize invalid generation ({self:?})"
+            )))
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Generation {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        u32::deserialize(deserializer).map(Self::Valid)
+    }
+}
+
+// Display is deliberately left unimplemented for Generation: this reduces the
+// risk of a bug where a generation is interpolated into a format!() string
+// directly instead of going through get_suffix().
+impl Debug for Generation {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Resolve the two magic variants to fixed placeholder text first;
+        // only a valid generation has a number to print.
+        let number = match *self {
+            Self::None => return f.write_str("<none>"),
+            Self::Broken => return f.write_str("<broken>"),
+            Self::Valid(number) => number,
+        };
+
+        // Zero-padded, 8-digit lowercase hex: the same digits that
+        // get_suffix() emits after its leading '-'.
+        write!(f, "{:08x}", number)
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -27,6 +27,9 @@ pub mod id;
 // http endpoint utils
 pub mod http;

+// definition of the Generation type for pageserver attachment APIs
+pub mod generation;
+
 // common log initialisation routine
 pub mod logging;

@@ -58,6 +61,8 @@ pub mod serde_regex;

 pub mod pageserver_feedback;

+pub mod postgres_client;
+
 pub mod tracing_span_assert;

 pub mod rate_limit;
--- a/libs/utils/src/postgres_client.rs
+++ b/libs/utils/src/postgres_client.rs
@@ -0,0 +1,37 @@
+//! Postgres client connection code common to other crates (safekeeper and
+//! pageserver) which depends on tenant/timeline ids and thus not fitting into
+//! postgres_connection crate.
+
+use anyhow::Context;
+use postgres_connection::{parse_host_port, PgConnectionConfig};
+
+use crate::id::TenantTimelineId;
+
+/// Build the client config used to fetch WAL from a safekeeper for one
+/// particular timeline.
+///
+/// `listen_pg_addr_str` is in form host:\[port\]; port 5432 is used when the
+/// address does not carry one. The timeline and tenant ids (and the availability
+/// zone, when given) are added as connection options, and `auth_token`, when
+/// given, is set as the password.
+pub fn wal_stream_connection_config(
+    TenantTimelineId {
+        tenant_id,
+        timeline_id,
+    }: TenantTimelineId,
+    listen_pg_addr_str: &str,
+    auth_token: Option<&str>,
+    availability_zone: Option<&str>,
+) -> anyhow::Result<PgConnectionConfig> {
+    let (host, maybe_port) =
+        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
+    let password = auth_token.map(str::to_owned);
+
+    let base_config = PgConnectionConfig::new_host_port(host, maybe_port.unwrap_or(5432))
+        .extend_options([
+            "-c".to_owned(),
+            format!("timeline_id={}", timeline_id),
+            format!("tenant_id={}", tenant_id),
+        ])
+        .set_password(password);
+
+    // The availability zone is optional and goes after the id options.
+    Ok(match availability_zone {
+        Some(az) => base_config.extend_options([format!("availability_zone={}", az)]),
+        None => base_config,
+    })
+}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "vm_monitor"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "vm-monitor"
+path = "./src/bin/monitor.rs"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+axum.workspace = true
+clap.workspace = true
+futures.workspace = true
+inotify.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+sysinfo.workspace = true
+tokio.workspace = true
+tokio-postgres.workspace = true
+tokio-stream.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "linux")'.dependencies]
+cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -0,0 +1,34 @@
+# `vm-monitor`
+
+The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
+along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
+two primary roles: 1) notifying agents when immediate upscaling is necessary due
+to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
+out upscaling and downscaling decisions.
+
+## More on scaling
+
+We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
+To control thresholds for receiving memory usage notifications, we start Postgres
+in the `neon-postgres` cgroup and set its `memory.{max,high}`.
+
+* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
+* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
+where initial development of the monitor happened. The repository is no longer
+maintained but the commit history may be useful for debugging.
+
+## Structure
+
+The `vm-monitor` is loosely comprised of a few systems. These are:
+* the server: this is just a simple `axum` server that accepts requests and
+upgrades them to websocket connections. The server only allows one connection at
+a time. This means that upon receiving a new connection, the server will terminate
+the old one if it exists.
+* the filecache: a struct that allows communication with the Postgres file cache.
+On startup, we connect to the filecache and hold on to the connection for the
+entire monitor lifetime.
+* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
+listening for `memory.high` events and setting its `memory.{high,max}` values.
+* the runner: the runner marries the filecache and cgroup watcher together,
+communicating with the agent through the `Dispatcher`, and then calling filecache
+and cgroup watcher functions as needed to upscale and downscale.
--- a/libs/vm_monitor/src/bin/monitor.rs
+++ b/libs/vm_monitor/src/bin/monitor.rs
@@ -0,0 +1,33 @@
+// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
+// we can test the monitor as part of the entire autoscaling system in
+// neondatabase/autoscaling.
+//
+// The monitor was previously started by vm-builder, and for testing purposes,
+// we can mimic that setup with this binary.
+
+#[cfg(target_os = "linux")]
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    use clap::Parser;
+    use tokio_util::sync::CancellationToken;
+    use tracing_subscriber::EnvFilter;
+    use vm_monitor::Args;
+
+    // Install a global JSON subscriber (with file, line number and span list),
+    // filtered through the default environment filter.
+    tracing::subscriber::set_global_default(
+        tracing_subscriber::fmt::Subscriber::builder()
+            .json()
+            .with_file(true)
+            .with_line_number(true)
+            .with_span_list(true)
+            .with_env_filter(EnvFilter::from_default_env())
+            .finish(),
+    )?;
+
+    // Leak the parsed arguments to obtain the `&'static Args` handed to
+    // `vm_monitor::start`.
+    let args: &'static Args = Box::leak(Box::new(Args::parse()));
+    vm_monitor::start(args, CancellationToken::new()).await
+}
+
+// Fallback entry point for every non-linux target: it only panics with an
+// explanatory message.
+#[cfg(not(target_os = "linux"))]
+fn main() {
+    panic!("the monitor requires cgroups, which are only available on linux")
+}
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -0,0 +1,693 @@
+use std::{
+    fmt::{Debug, Display},
+    fs,
+    pin::pin,
+    sync::atomic::{AtomicU64, Ordering},
+};
+
+use anyhow::{anyhow, bail, Context};
+use cgroups_rs::{
+    freezer::FreezerController,
+    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    memory::MemController,
+    MaxValue,
+    Subsystem::{Freezer, Mem},
+};
+use inotify::{EventStream, Inotify, WatchMask};
+use tokio::sync::mpsc::{self, error::TryRecvError};
+use tokio::time::{Duration, Instant};
+use tokio_stream::{Stream, StreamExt};
+use tracing::{info, warn};
+
+use crate::protocol::Resources;
+use crate::MiB;
+
+/// Monotonically increasing counter of the number of memory.high events
+/// the cgroup has experienced.
+///
+/// We use this to determine if a modification to the `memory.events` file actually
+/// changed the `high` field. If not, we don't care about the change. When we
+/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
+/// to see if it changed since last time.
+pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Monotonically increasing counter that gives each cgroup event a unique id.
+///
+/// This allows us to answer questions like "did this upscale arrive before this
+/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
+/// with a sequence number. As such, prefer to use the `Sequenced` type rather
+/// than this static directly.
+static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
+
+/// A memory event type reported in memory.events.
+///
+/// Each variant maps to the key it is looked up under in that file; see
+/// `as_str` for the exact strings.
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub enum MemoryEvent {
+    Low,
+    High,
+    Max,
+    Oom,
+    OomKill,
+    OomGroupKill,
+}
+
+impl MemoryEvent {
+    /// The textual key for this event.
+    fn as_str(&self) -> &str {
+        match *self {
+            Self::Low => "low",
+            Self::High => "high",
+            Self::Max => "max",
+            Self::Oom => "oom",
+            Self::OomKill => "oom_kill",
+            Self::OomGroupKill => "oom_group_kill",
+        }
+    }
+}
+
+impl Display for MemoryEvent {
+    /// Writes exactly the string returned by `as_str`.
+    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let key = self.as_str();
+        formatter.write_str(key)
+    }
+}
+
+/// Configuration for a `CgroupWatcher`
+///
+/// All `*_bytes` fields are byte counts; all timing fields are `Duration`s.
+#[derive(Debug, Clone)]
+pub struct Config {
+    // The target difference between the total memory reserved for the cgroup
+    // and the value of the cgroup's memory.high.
+    //
+    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
+    // use (equal to system memory, minus whatever's taken out for the file cache).
+    oom_buffer_bytes: u64,
+
+    // The amount of memory, in bytes, below a proposed new value for
+    // memory.high that the cgroup's memory usage must be for us to downscale
+    //
+    // In other words, we can downscale only when:
+    //
+    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
+    //
+    // TODO: there's some minor issues with this approach -- in particular, that we might have
+    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
+    pub(crate) memory_high_buffer_bytes: u64,
+
+    // The maximum duration that we're allowed to pause the cgroup for while
+    // waiting for the autoscaler-agent to upscale us
+    max_upscale_wait: Duration,
+
+    // The required minimum time that we must wait before re-freezing
+    // the cgroup while waiting for the autoscaler-agent to upscale us.
+    do_not_freeze_more_often_than: Duration,
+
+    // The amount of memory, in bytes, that we should periodically increase memory.high
+    // by while waiting for the autoscaler-agent to upscale us.
+    //
+    // This exists to avoid the excessive throttling that happens when a cgroup is above its
+    // memory.high for too long. See more here:
+    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
+    memory_high_increase_by_bytes: u64,
+
+    // The period at which we should repeatedly increase the value
+    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
+    // is still being hit.
+    //
+    // Technically speaking, this actually serves as a rate limit to moderate responding to
+    // memory.high events, but these are roughly equivalent if the process is still allocating
+    // memory.
+    memory_high_increase_every: Duration,
+}
+
+impl Config {
+    /// Calculate the new value for the cgroup's memory.high based on system memory.
+    ///
+    /// Returns `total_system_mem` minus `oom_buffer_bytes`, saturating at zero
+    /// when the buffer is larger than the total.
+    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
+        total_system_mem.saturating_sub(self.oom_buffer_bytes)
+    }
+}
+
+impl Default for Config {
+    /// The default tuning used by `CgroupWatcher::new`.
+    fn default() -> Self {
+        // while waiting for upscale, don't freeze for more than 20ms every 1s
+        let max_upscale_wait = Duration::from_millis(20);
+        let do_not_freeze_more_often_than = Duration::from_millis(1000);
+
+        // while waiting for upscale, increase memory.high by 10MiB every 25ms
+        let memory_high_increase_by_bytes = 10 * MiB;
+        let memory_high_increase_every = Duration::from_millis(25);
+
+        Self {
+            oom_buffer_bytes: 100 * MiB,
+            memory_high_buffer_bytes: 100 * MiB,
+            max_upscale_wait,
+            do_not_freeze_more_often_than,
+            memory_high_increase_by_bytes,
+            memory_high_increase_every,
+        }
+    }
+}
+
+/// A value tagged with the point in time at which it was created, such as an
+/// upscale request or memory.high event.
+///
+/// The tag is a sequence number drawn from a static atomic counter when the
+/// `Sequenced` is constructed. Sequence numbers are monotonically increasing,
+/// so comparing the numbers of two values answers questions like "did this
+/// upscale happen after this memory.high event?".
+#[derive(Debug, Clone)]
+pub struct Sequenced<T> {
+    seqnum: u64,
+    data: T,
+}
+
+impl<T> Sequenced<T> {
+    /// Tag `data` with the next value of the global event sequence counter.
+    pub fn new(data: T) -> Self {
+        let seqnum = EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel);
+        Self { seqnum, data }
+    }
+}
+
+/// Responds to memory.high events and upscale notifications to manage the
+/// cgroup: preventing it from being OOM killed or throttled.
+///
+/// The `CgroupWatcher` primarily achieves this by reading from a stream of
+/// memory.high events and a channel of upscale notifications. See `watch` for
+/// details on how it keeps the cgroup happy.
+#[derive(Debug)]
+pub struct CgroupWatcher {
+    pub config: Config,
+
+    /// The sequence number of the last upscale.
+    ///
+    /// If we receive a memory.high event that has a _lower_ sequence number than
+    /// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
+    /// can safely ignore it.
+    ///
+    /// Note: this doesn't _need_ interior mutability but we use it anyways so
+    /// that methods take `&self`, not `&mut self`.
+    last_upscale_seqnum: AtomicU64,
+
+    /// A channel on which we send messages to request upscale from the dispatcher.
+    upscale_requester: mpsc::Sender<()>,
+
+    /// The actual cgroup we are watching and managing.
+    cgroup: cgroups_rs::Cgroup,
+}
+
+/// Look up the counter for one event type in a `memory.events` file.
+///
+/// `path` is the path of the `memory.events` file to read. Fails if the file
+/// cannot be read, has no entry for `event`, or the entry's count is not a
+/// valid `u64`.
+/// For more info, see the `memory.events` section of the [kernel docs]
+/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
+fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
+    let contents = fs::read_to_string(path)
+        .with_context(|| format!("failed to read memory.events from {path}"))?;
+
+    // The contents of the file look like:
+    // low 42
+    // high 101
+    // ...
+    for line in contents.lines() {
+        // Lines without a space separator are skipped.
+        let Some((key, count)) = line.split_once(' ') else {
+            continue;
+        };
+        if key == event.as_str() {
+            // Only the first matching entry is considered.
+            return count
+                .parse::<u64>()
+                .with_context(|| format!("failed to parse memory.{event} as u64"));
+        }
+    }
+
+    Err(anyhow!(
+        "failed to find entry for memory.{event} events in {path}"
+    ))
+}
+
+/// Set up an inotify watch on `path` and return it as a stream that yields an
+/// event whenever the file is modified.
+fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
+    info!("creating file watcher for {path}");
+    let watcher = Inotify::init().context("failed to initialize file watcher")?;
+    watcher
+        .watches()
+        .add(path, WatchMask::MODIFY)
+        .with_context(|| format!("failed to start watching {path}"))?;
+
+    // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
+    // to store one event at a time - if the event gets written over, that's
+    // ok. We still see that there is an event. For more information, see:
+    // https://man7.org/linux/man-pages/man7/inotify.7.html
+    let buffer = [0u8; 1024];
+    watcher
+        .into_event_stream(buffer)
+        .context("failed to start inotify event stream")
+}
+
+impl CgroupWatcher {
+    /// Create a new `CgroupWatcher` for the cgroup called `name`.
+    ///
+    /// Returns the watcher together with a stream of memory.high event counts.
+    /// The stream only yields when the `high` counter read from the cgroup's
+    /// `memory.events` file exceeds the largest value seen so far, and every
+    /// item is tagged with a sequence number via `Sequenced::new`.
+    ///
+    /// Fails if cgroups v2 is not in use, if the file watcher cannot be
+    /// created, or if the initial memory.high count cannot be read.
+    #[tracing::instrument(skip_all, fields(%name))]
+    pub fn new(
+        name: String,
+        // A channel on which to send upscale requests
+        upscale_requester: mpsc::Sender<()>,
+    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
+        // TODO: clarify exactly why we need v2
+        // Make sure cgroups v2 (aka unified) are supported
+        if !is_cgroup2_unified_mode() {
+            anyhow::bail!("cgroups v2 not supported");
+        }
+        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
+
+        // Start monitoring the cgroup for memory events. In general, for
+        // cgroups v2 (aka unified), metrics are reported in files like
+        // > `/sys/fs/cgroup/{name}/{metric}`
+        // We are looking for `memory.high` events, which are stored in the
+        // file `memory.events`. For more info, see the `memory.events` section
+        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
+        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
+        let memory_events = create_file_watcher(&path)
+            .with_context(|| format!("failed to create event watcher for {path}"))?
+            // This would be nicer with .inspect_err followed by .ok
+            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
+                Ok(high) => Some(high),
+                Err(error) => {
+                    // TODO: Might want to just panic here
+                    warn!(?error, "failed to read high events count from {}", &path);
+                    None
+                }
+            })
+            // Only report the event if the memory.high count increased
+            .filter_map(|high| {
+                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
+                    Some(high)
+                } else {
+                    None
+                }
+            })
+            .map(Sequenced::new);
+
+        // `path` was moved into the closure above, so the same path is built
+        // again for the initial read.
+        let initial_count = get_event_count(
+            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
+            MemoryEvent::High,
+        )?;
+
+        info!(initial_count, "initial memory.high event count");
+
+        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
+        // running in the cgroup before that caused it to be non-zero.
+        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
+
+        Ok((
+            Self {
+                cgroup,
+                upscale_requester,
+                last_upscale_seqnum: AtomicU64::new(0),
+                config: Default::default(),
+            },
+            memory_events,
+        ))
+    }
+
+    /// The entrypoint for the `CgroupWatcher`.
+    ///
+    /// Loops forever, reacting to two inputs: upscale notifications arriving on
+    /// `upscales` and memory.high events arriving on `events`. For a memory.high
+    /// event that is not already covered by an upscale, it tries, in order:
+    /// 1. freezing the cgroup and requesting upscale (rate limited by
+    ///    `do_not_freeze_more_often_than`),
+    /// 2. just requesting upscale, if we are not already waiting on one,
+    /// 3. requesting upscale again and raising memory.high (rate limited by
+    ///    `memory_high_increase_every`).
+    ///
+    /// The function never returns `Ok`: it only returns, with an error, when
+    /// one of the inputs closes or a cgroup/channel operation fails.
+    #[tracing::instrument(skip_all)]
+    pub async fn watch<E>(
+        &self,
+        // These are ~dependency injected~ (fancy, I know) because this function
+        // should never return.
+        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
+        // -> therefore: if we want to stick it in an Arc so many threads can access
+        //    it, methods can never take mutable access.
+        //     - note: we use the Arc strategy so that a) we can call this function
+        //             right here and b) the runner can call the set/get_memory methods
+        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
+        //    we just pass them in here instead of holding them in fields, as that
+        //    would require this method to take &mut self.
+        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
+        events: E,
+    ) -> anyhow::Result<()>
+    where
+        E: Stream<Item = Sequenced<u64>>,
+    {
+        // There are several actions we might take when receiving a `memory.high`,
+        // such as freezing the cgroup, or increasing its `memory.high`. We don't
+        // want to do these things too often (because postgres needs to run, and
+        // we only have so much memory). These timers serve as rate limits for this.
+        //
+        // NOTE(review): these timers are only ever inspected with `is_elapsed()`
+        // and re-armed with `reset()`; they are never awaited. Confirm that
+        // `is_elapsed()` reports what we expect for a timer that is never polled.
+        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut events = pin!(events);
+
+        // Are we waiting to be upscaled? Could be true if we request upscale due
+        // to a memory.high event and it does not arrive in time.
+        let mut waiting_on_upscale = false;
+
+        loop {
+            tokio::select! {
+                upscale = upscales.recv() => {
+                    let Sequenced { seqnum, data } = upscale
+                        .context("failed to listen on upscale notification channel")?;
+                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                }
+                event = events.next() => {
+                    let Some(Sequenced { seqnum, .. }) = event else {
+                        bail!("failed to listen for memory.high events")
+                    };
+                    // The memory.high came before our last upscale, so we consider
+                    // it resolved
+                    //
+                    // NOTE(review): `fetch_max` also stores this event's seqnum into
+                    // `last_upscale_seqnum` when it is the larger of the two; confirm
+                    // that is intended rather than a plain load.
+                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
+                        info!(
+                            "received memory.high event, but it came before our last upscale -> ignoring it"
+                        );
+                        continue;
+                    }
+
+                    // The memory.high came after our latest upscale. We don't
+                    // want to do anything yet, so peek the next event in hopes
+                    // that it's an upscale.
+                    if let Some(upscale_num) = self
+                        .upscaled(&mut upscales)
+                        .context("failed to check if we were upscaled")?
+                    {
+                        if upscale_num > seqnum {
+                            info!(
+                                "received memory.high event, but it came before our last upscale -> ignoring it"
+                            );
+                            continue;
+                        }
+                    }
+
+                    // If it's been long enough since we last froze, freeze the
+                    // cgroup and request upscale
+                    if wait_to_freeze.is_elapsed() {
+                        info!("received memory.high event -> requesting upscale");
+                        waiting_on_upscale = self
+                            .handle_memory_high_event(&mut upscales)
+                            .await
+                            .context("failed to handle upscale")?;
+                        wait_to_freeze
+                            .as_mut()
+                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
+                        continue;
+                    }
+
+                    // Ok, we can't freeze, just request upscale
+                    if !waiting_on_upscale {
+                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");
+
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to request upscaling because we got upscaled");
+                            continue;
+                        }
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+                        continue;
+                    }
+
+                    // Shoot, we can't freeze and we're still waiting on upscale,
+                    // increase memory.high to reduce throttling
+                    if wait_to_increase_memory_high.is_elapsed() {
+                        info!(
+                            "received memory.high event, \
+                            but too soon to refreeze and already requested upscale \
+                            -> increasing memory.high"
+                        );
+
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to increase memory.high because got upscaled");
+                            continue;
+                        }
+
+                        // Request upscale anyways (the agent will handle deduplicating
+                        // requests)
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+
+                        let memory_high =
+                            self.get_high_bytes().context("failed to get memory.high")?;
+                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
+                        info!(
+                            current_high_bytes = memory_high,
+                            new_high_bytes = new_high,
+                            "updating memory.high"
+                        );
+                        self.set_high_bytes(new_high)
+                            .context("failed to set memory.high")?;
+                        wait_to_increase_memory_high
+                            .as_mut()
+                            .reset(Instant::now() + self.config.memory_high_increase_every)
+                    }
+
+                    // we can't do anything
+                }
+            };
+        }
+    }
+
+    /// Handle a `memory.high`, returning whether we are still waiting on upscale
+    /// by the time the function returns.
+    ///
+    /// The general plan for handling a `memory.high` event is as follows:
+    /// 1. Freeze the cgroup
+    /// 2. Start a timer for `self.config.max_upscale_wait`
+    /// 3. Request upscale
+    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
+    /// 5. Return whether or not we are still waiting for upscale. If we are,
+    ///    we'll increase the cgroups memory.high to avoid getting oom killed
+    ///
+    /// Once the cgroup has been frozen, every return path thaws it again first,
+    /// including the paths where requesting or awaiting the upscale fails.
+    ///
+    /// Returns an error if freezing, thawing, requesting the upscale or
+    /// listening for the upscale fails.
+    #[tracing::instrument(skip_all)]
+    async fn handle_memory_high_event(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<bool> {
+        // Immediately freeze the cgroup before doing anything else.
+        info!("received memory.high event -> freezing cgroup");
+        self.freeze().context("failed to freeze cgroup")?;
+
+        // We'll use this for logging durations
+        let start_time = Instant::now();
+
+        // Await the upscale until we have to unfreeze
+        let timed =
+            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
+
+        // Request the upscale
+        info!(
+            wait = ?self.config.max_upscale_wait,
+            "sending request for immediate upscaling",
+        );
+        // **important**: unfreeze the cgroup before reporting the error, so a
+        // failed request cannot leave the cgroup frozen
+        if let Err(e) = self.upscale_requester.send(()).await {
+            info!("error requesting upscale -> thawing cgroup");
+            self.thaw()
+                .context("failed to thaw cgroup after failing to request upscale")?;
+            return Err(anyhow::Error::new(e).context("failed to request upscale"));
+        }
+
+        let waiting_on_upscale = match timed.await {
+            Ok(Ok(())) => {
+                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
+                false
+            }
+            // **important**: unfreeze the cgroup before ?-reporting the error
+            Ok(Err(e)) => {
+                info!("error waiting for upscale -> thawing cgroup");
+                self.thaw()
+                    .context("failed to thaw cgroup after errored waiting for upscale")?;
+                Err(e.context("failed to await upscale"))?
+            }
+            Err(_) => {
+                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
+                true
+            }
+        };
+
+        info!("thawing cgroup");
+        self.thaw().context("failed to thaw cgroup")?;
+
+        Ok(waiting_on_upscale)
+    }
+
+    /// Non-blocking check for a pending upscale notification.
+    ///
+    /// Returns the upscale's sequence number if one was waiting on the channel
+    /// (recording it as the last upscale), `None` if the channel is empty, and
+    /// an error if the channel has been disconnected.
+    #[tracing::instrument(skip_all)]
+    fn upscaled(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<Option<u64>> {
+        match upscales.try_recv() {
+            Ok(Sequenced { seqnum, data }) => {
+                // Make sure to update the last upscale sequence number
+                self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                Ok(Some(seqnum))
+            }
+            Err(TryRecvError::Empty) => Ok(None),
+            Err(TryRecvError::Disconnected) => {
+                bail!("upscale notification channel was disconnected")
+            }
+        }
+    }
+
+    /// Wait for the next upscale notification and record its sequence number
+    /// as the last upscale.
+    ///
+    /// Only the upscale channel is listened to here; `memory.high` events are
+    /// not looked at while waiting. This is used in `handle_memory_high_event`,
+    /// where we need to know about upscales in particular so we can thaw the
+    /// cgroup early.
+    ///
+    /// Returns an error if the channel yields nothing.
+    #[tracing::instrument(skip_all)]
+    async fn await_upscale(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<()> {
+        let received = upscales.recv().await;
+        let Sequenced { seqnum, .. } = received.context("error listening for upscales")?;
+
+        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+        Ok(())
+    }
+
+    /// Get the cgroup's path, as reported by the underlying `cgroups_rs::Cgroup`.
+    ///
+    /// The cgroup is loaded from its name in `new`, so this is expected to be
+    /// that name (NOTE(review): confirm against `cgroups_rs::Cgroup::path`).
+    pub fn path(&self) -> &str {
+        self.cgroup.path()
+    }
+}
+
+/// The pair of memory limits (`high` and `max`) we apply to a cgroup to
+/// control its memory usage.
+///
+/// Setting these values also affects the thresholds for receiving usage alerts.
+#[derive(Debug)]
+pub struct MemoryLimits {
+    high: u64,
+    max: u64,
+}
+
+impl MemoryLimits {
+    /// Bundle a `high` and a `max` value, given in that order.
+    pub fn new(high: u64, max: u64) -> Self {
+        Self { high, max }
+    }
+}
+
+// Methods for manipulating the actual cgroup
+impl CgroupWatcher {
+    /// Look up the freezer controller among the cgroup's subsystems.
+    ///
+    /// Returns an error if the cgroup has no freezer subsystem.
+    fn freezer(&self) -> anyhow::Result<&FreezerController> {
+        let found = self.cgroup.subsystems().iter().find_map(|sub| match sub {
+            Freezer(freezer) => Some(freezer),
+            _ => None,
+        });
+        match found {
+            Some(freezer) => Ok(freezer),
+            None => anyhow::bail!("could not find freezer subsystem"),
+        }
+    }
+
+    /// Freeze the cgroup through its freezer subsystem.
+    pub fn freeze(&self) -> anyhow::Result<()> {
+        let freezer = self.freezer().context("failed to get freezer subsystem")?;
+        freezer.freeze().context("failed to freeze")
+    }
+
+    /// Thaw the cgroup through its freezer subsystem.
+    pub fn thaw(&self) -> anyhow::Result<()> {
+        let freezer = self.freezer().context("failed to get freezer subsystem")?;
+        freezer.thaw().context("failed to thaw")
+    }
+
+    /// Look up the memory controller among the cgroup's subsystems.
+    ///
+    /// This only searches `self.cgroup.subsystems()`. Returns an error if the
+    /// cgroup has no memory subsystem.
+    fn memory(&self) -> anyhow::Result<&MemController> {
+        let found = self.cgroup.subsystems().iter().find_map(|sub| match sub {
+            Mem(memory) => Some(memory),
+            _ => None,
+        });
+        match found {
+            Some(memory) => Ok(memory),
+            None => anyhow::bail!("could not find memory subsystem"),
+        }
+    }
+
+    /// Get the cgroup's current memory usage, in bytes, as reported by the
+    /// memory subsystem's `memory_stat()`.
+    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
+        let memory = self.memory().context("failed to get memory subsystem")?;
+        Ok(memory.memory_stat().usage_in_bytes)
+    }
+
+    /// Set the cgroup's memory.high threshold, leaving the other limits alone.
+    ///
+    /// Values above `i64::MAX` are clamped to `i64::MAX`.
+    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        let memory = self.memory().context("failed to get memory subsystem")?;
+        // The limit is written as an i64, so clamp before converting.
+        let clamped = bytes.min(i64::MAX as u64) as i64;
+        let update = cgroups_rs::memory::SetMemory {
+            low: None,
+            high: Some(MaxValue::Value(clamped)),
+            min: None,
+            max: None,
+        };
+        memory.set_mem(update).context("failed to set memory.high")
+    }
+
+    /// Write both memory.high and memory.max for the cgroup.
+    ///
+    /// Each value is clamped to `i64::MAX` before being written.
+    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
+        info!(
+            limits.high,
+            limits.max,
+            path = self.path(),
+            "writing new memory limits",
+        );
+        let memory = self
+            .memory()
+            .context("failed to get memory subsystem while setting memory limits")?;
+
+        // The limits are written as i64s, so clamp before converting.
+        let clamp = |bytes: u64| MaxValue::Value(bytes.min(i64::MAX as u64) as i64);
+        let update = cgroups_rs::memory::SetMemory {
+            min: None,
+            low: None,
+            high: Some(clamp(limits.high)),
+            max: Some(clamp(limits.max)),
+        };
+        memory.set_mem(update).context("failed to set memory limits")
+    }
+
+    /// Derive and apply the cgroup memory limits for `available_memory` bytes.
+    ///
+    /// memory.max is set to `available_memory`, and memory.high to the value
+    /// computed by `Config::calculate_memory_high_value`.
+    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
+        let limits = MemoryLimits::new(
+            self.config.calculate_memory_high_value(available_memory),
+            available_memory,
+        );
+        info!(
+            path = self.path(),
+            memory = ?limits,
+            "setting cgroup memory",
+        );
+        self.set_limits(&limits)
+            .context("failed to set cgroup memory limits")
+    }
+
+    /// Read the cgroup's memory.high threshold.
+    ///
+    /// A threshold of `max` is reported as `i64::MAX`. Returns an error if the
+    /// memory subsystem or its statistics cannot be read, or if they carry no
+    /// `high` value.
+    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
+        let memory = self
+            .memory()
+            .context("failed to get memory subsystem while getting memory statistics")?;
+        let stats = memory
+            .get_mem()
+            .context("failed to get memory statistics from subsystem")?;
+        match stats.high {
+            Some(MaxValue::Value(high)) => Ok(high as u64),
+            Some(MaxValue::Max) => Ok(i64::MAX as u64),
+            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
+        }
+    }
+}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -0,0 +1,153 @@
+//! Managing the websocket connection and other signals in the monitor.
+//!
+//! Contains types that manage the interaction (not data interchange, see `protocol`)
+//! between agent and monitor, allowing us to process and send messages in a
+//! straightforward way. The dispatcher also manages the signals that come from
+//! the cgroup (requesting upscale), and the signals that go to the cgroup
+//! (notifying it of upscale).
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::{
+    stream::{SplitSink, SplitStream},
+    SinkExt, StreamExt,
+};
+use tokio::sync::mpsc;
+use tracing::info;
+
+use crate::cgroup::Sequenced;
+use crate::protocol::{
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
+    PROTOCOL_MIN_VERSION,
+};
+
+/// The central handler for all communications in the monitor.
+///
+/// The dispatcher has two purposes:
+/// 1. Manage the connection to the agent, sending and receiving messages.
+/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
+///    and sending a message to the agent when the cgroup manager requests
+///    upscale.
+#[derive(Debug)]
+pub struct Dispatcher {
+    /// We read agent messages off of `source`
+    pub(crate) source: SplitStream<WebSocket>,
+
+    /// We send messages to the agent through `sink`
+    sink: SplitSink<WebSocket, Message>,
+
+    /// Used to notify the cgroup when we are upscaled.
+    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+
+    /// When the cgroup requests upscale it will send on this channel. In response
+    /// we send an `UpscaleRequest` to the agent.
+    pub(crate) request_upscale_events: mpsc::Receiver<()>,
+
+    /// The protocol version we have agreed to use with the agent. This is negotiated
+    /// during the creation of the dispatcher, and should be the highest shared protocol
+    /// version.
+    ///
+    // NOTE: currently unused, but will almost certainly be used in the future
+    // as the protocol changes
+    #[allow(unused)]
+    pub(crate) proto_version: ProtocolVersion,
+}
+
+impl Dispatcher {
+    /// Creates a new dispatcher using the passed-in connection.
+    ///
+    /// Performs a negotiation with the agent to determine the highest protocol
+    /// version that both support. This consists of two steps:
+    /// 1. Wait for the agent to send the range of protocols it supports.
+    /// 2. Send a protocol version that works for us as well, or an error if there
+    ///    is no compatible version.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the connection closes or errors during the handshake, if the
+    /// first message is not a text frame holding a valid `ProtocolRange`, if a
+    /// reply cannot be serialized or sent, or if the version ranges do not overlap.
+    pub async fn new(
+        stream: WebSocket,
+        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+        request_upscale_events: mpsc::Receiver<()>,
+    ) -> anyhow::Result<Self> {
+        let (mut sink, mut source) = stream.split();
+
+        // Figure out the highest protocol version we both support
+        info!("waiting for agent to send protocol version range");
+        let Some(message) = source.next().await else {
+            bail!("websocket connection closed while performing protocol handshake")
+        };
+
+        let message = message.context("failed to read protocol version range off connection")?;
+
+        let Message::Text(message_text) = message else {
+            // All messages should be in text form, since we don't do any
+            // pinging/ponging. See nhooyr/websocket's implementation and the
+            // agent for more info
+            bail!("received non-text message during protocol handshake: {message:?}")
+        };
+
+        let monitor_range = ProtocolRange {
+            min: PROTOCOL_MIN_VERSION,
+            max: PROTOCOL_MAX_VERSION,
+        };
+
+        let agent_range: ProtocolRange = serde_json::from_str(&message_text)
+            .context("failed to deserialize protocol version range")?;
+
+        info!(range = ?agent_range, "received protocol version range");
+
+        let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
+            Ok(version) => {
+                // Propagate serialization failures (as `send` does) instead of panicking.
+                let reply = serde_json::to_string(&ProtocolResponse::Version(version))
+                    .context("failed to serialize negotiated protocol version")?;
+                sink.send(Message::Text(reply))
+                    .await
+                    .context("failed to notify agent of negotiated protocol version")?;
+                version
+            }
+            Err(e) => {
+                // Tell the agent why the handshake failed before bailing out ourselves.
+                let reply = serde_json::to_string(&ProtocolResponse::Error(format!(
+                    "Received protocol version range {} which does not overlap with {}",
+                    agent_range, monitor_range
+                )))
+                .context("failed to serialize protocol version error")?;
+                sink.send(Message::Text(reply))
+                    .await
+                    .context("failed to notify agent of no overlap between protocol version ranges")?;
+                Err(e).context("error determining suitable protocol version range")?
+            }
+        };
+
+        Ok(Self {
+            sink,
+            source,
+            notify_upscale_events,
+            request_upscale_events,
+            proto_version: highest_shared_version,
+        })
+    }
+
+    /// Notify the cgroup manager that we have received upscale.
+    ///
+    /// This only waits until `resources` has been handed to the
+    /// `notify_upscale_events` channel; it returns an error if that send fails.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
+        self.notify_upscale_events
+            .send(resources)
+            .await
+            .context("failed to send resources and oneshot sender across channel")
+    }
+
+    /// Send a message to the agent.
+    ///
+    /// Although this function is small, it has one major benefit: it is the only
+    /// way to send data across the connection, and you can only pass in a proper
+    /// `OutboundMsg`. Without safeguards like this, it's easy to accidentally
+    /// serialize the wrong thing and send it, since `self.sink.send` will take
+    /// any string.
+    ///
+    /// Returns an error if the message cannot be serialized to JSON or if
+    /// writing it to the websocket fails.
+    pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
+        info!(?message, "sending message");
+        let json = serde_json::to_string(&message).context("failed to serialize message")?;
+        self.sink
+            .send(Message::Text(json))
+            .await
+            .context("stream error sending message")
+    }
+}
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -0,0 +1,316 @@
+//! Logic for configuring and scaling the Postgres file cache.
+
+use std::num::NonZeroU64;
+
+use crate::MiB;
+use anyhow::{anyhow, Context};
+use tokio_postgres::{types::ToSql, Client, NoTls, Row};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+/// Manages Postgres' file cache by keeping a connection open.
+#[derive(Debug)]
+pub struct FileCacheState {
+    /// Client for the current Postgres connection; replaced when a query has to
+    /// be retried on a fresh connection.
+    client: Client,
+    /// Connection string, retained so that the connection can be re-established.
+    conn_str: String,
+    /// Sizing parameters for the file cache.
+    pub(crate) config: FileCacheConfig,
+
+    /// A token for cancelling spawned tasks during shutdown.
+    token: CancellationToken,
+}
+
+/// Parameters that determine how large the file cache should be for a given
+/// amount of total resources. See `validate` for the constraints between fields.
+#[derive(Debug)]
+pub struct FileCacheConfig {
+    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
+    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
+    /// memory available for the cgroup.
+    pub(crate) in_memory: bool,
+
+    /// The size of the file cache, in terms of the size of the resource it consumes
+    /// (currently: only memory)
+    ///
+    /// For example, setting `resource_multiplier = 0.75` gives the cache a target size of 75% of total
+    /// resources.
+    ///
+    /// This value must be strictly between 0 and 1.
+    resource_multiplier: f64,
+
+    /// The required minimum amount of memory, in bytes, that must remain available
+    /// after subtracting the file cache.
+    ///
+    /// This value must be non-zero.
+    min_remaining_after_cache: NonZeroU64,
+
+    /// Controls the rate of increase in the file cache's size as it grows from zero
+    /// (when total resources equals min_remaining_after_cache) to the desired size based on
+    /// `resource_multiplier`.
+    ///
+    /// A `spread_factor` of zero means that all additional resources will go to the cache until it
+    /// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
+    /// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
+    /// its desired size".
+    ///
+    /// This value must be >= 0, and must retain an increase that is more than what would be given by
+    /// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
+    /// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
+    /// as desired by `resource_multiplier`.
+    ///
+    /// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
+    spread_factor: f64,
+}
+
+impl FileCacheConfig {
+    /// Default configuration for a file cache that is held in memory.
+    pub fn default_in_memory() -> Self {
+        Self {
+            in_memory: true,
+            // 75 %
+            resource_multiplier: 0.75,
+            // 640 MiB; (512 + 128)
+            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
+            // ensure any increase in file cache size is split 90-10 with 10% to other memory
+            spread_factor: 0.1,
+        }
+    }
+
+    /// Default configuration for a file cache that is stored on disk.
+    pub fn default_on_disk() -> Self {
+        Self {
+            in_memory: false,
+            resource_multiplier: 0.75,
+            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
+            // memory, the kernel will just evict from its page cache, rather than e.g. killing
+            // everything.
+            min_remaining_after_cache: NonZeroU64::new(256 * MiB).unwrap(),
+            spread_factor: 0.1,
+        }
+    }
+
+    /// Make sure fields of the config are consistent.
+    ///
+    /// Returns an error if `resource_multiplier` is not strictly between 0 and 1,
+    /// if `spread_factor` is negative, or if the two are incompatible with each
+    /// other (`resource_multiplier * (spread_factor + 1) >= 1`).
+    pub fn validate(&self) -> anyhow::Result<()> {
+        // Single field validity
+        anyhow::ensure!(
+            0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
+            "resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
+            self.resource_multiplier
+        );
+        anyhow::ensure!(
+            self.spread_factor >= 0.0,
+            "spread_factor must be >= 0, got {}",
+            self.spread_factor
+        );
+
+        // Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
+        //
+        // As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
+        // `spread_factor`, respectively. They are:
+        //
+        //                 `total`           `min_remaining_after_cache`
+        //   size = ————————————————————— - —————————————————————————————
+        //           `spread_factor` + 1         `spread_factor` + 1
+        //
+        // and
+        //
+        //   size = `resource_multiplier` × total
+        //
+        // .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
+        // form, with y = "size" and x = "total".
+        //
+        // These lines intersect at:
+        //
+        //               `min_remaining_after_cache`
+        //   ———————————————————————————————————————————————————
+        //    1 - `resource_multiplier` × (`spread_factor` + 1)
+        //
+        // We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
+        // guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
+        // (We also need it to be >= 0, but that's already guaranteed.)
+
+        let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
+        anyhow::ensure!(
+            intersect_factor < 1.0,
+            "incompatible resource_multiplier and spread_factor"
+        );
+        Ok(())
+    }
+
+    /// Calculate the desired size of the cache, given the total memory.
+    ///
+    /// Both `total` and the result are in bytes; the result is always a whole
+    /// number of mebibytes. Returns 0 when `total` does not exceed
+    /// `min_remaining_after_cache`.
+    pub fn calculate_cache_size(&self, total: u64) -> u64 {
+        // *Note*: all values are in bytes; the last line only rounds down to a
+        // whole number of MiB.
+        let available = total.saturating_sub(self.min_remaining_after_cache.get());
+        if available == 0 {
+            return 0;
+        }
+
+        // Conversions to ensure we don't overflow from floating-point ops
+        let size_from_spread =
+            i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
+
+        let size_from_normal = (total as f64 * self.resource_multiplier) as u64;
+
+        // The cache gets whichever of the two lines (see `validate`) is lower.
+        let byte_size = u64::min(size_from_spread, size_from_normal);
+
+        // The file cache operates in units of mebibytes, so the sizes we produce should
+        // be rounded to a mebibyte. We round down to be conservative.
+        byte_size / MiB * MiB
+    }
+}
+
+impl FileCacheState {
+    /// Connect to the file cache.
+    ///
+    /// Validates `config` first, then opens a Postgres connection using
+    /// `conn_str`. Returns an error if either step fails.
+    #[tracing::instrument(skip_all, fields(%conn_str, ?config))]
+    pub async fn new(
+        conn_str: &str,
+        config: FileCacheConfig,
+        token: CancellationToken,
+    ) -> anyhow::Result<Self> {
+        config.validate().context("file cache config is invalid")?;
+
+        info!(conn_str, "connecting to Postgres file cache");
+        let client = FileCacheState::connect(conn_str, token.clone())
+            .await
+            .context("failed to connect to postgres file cache")?;
+
+        let conn_str = conn_str.to_string();
+        Ok(Self {
+            client,
+            config,
+            conn_str,
+            token,
+        })
+    }
+
+    /// Connect to Postgres.
+    ///
+    /// Stops the spawned connection task if `token` is cancelled. This is not
+    /// a method as it is called in [`FileCacheState::new`].
+    #[tracing::instrument(skip_all, fields(%conn_str))]
+    async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
+        let (client, conn) = tokio_postgres::connect(conn_str, NoTls)
+            .await
+            .context("failed to connect to pg client")?;
+
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own. See tokio-postgres docs.
+        crate::spawn_with_cancel(
+            token,
+            |res| {
+                if let Err(error) = res {
+                    error!(%error, "postgres error")
+                }
+            },
+            conn,
+        );
+
+        Ok(client)
+    }
+
+    /// Execute a query with a retry if necessary.
+    ///
+    /// If the initial query fails, we restart the database connection and attempt
+    /// it again. The query is retried at most once.
+    #[tracing::instrument(skip_all, fields(%statement))]
+    pub async fn query_with_retry(
+        &mut self,
+        statement: &str,
+        params: &[&(dyn ToSql + Sync)],
+    ) -> anyhow::Result<Vec<Row>> {
+        match self
+            .client
+            .query(statement, params)
+            .await
+            .context("failed to execute query")
+        {
+            Ok(rows) => Ok(rows),
+            Err(e) => {
+                error!(error = ?e, "postgres error: {e} -> retrying");
+
+                let client = FileCacheState::connect(&self.conn_str, self.token.clone())
+                    .await
+                    .context("failed to connect to postgres file cache")?;
+                info!("successfully reconnected to postgres client");
+
+                // Replace the old client and attempt the query with the new one
+                self.client = client;
+                self.client
+                    .query(statement, params)
+                    .await
+                    .context("failed to execute query a second time")
+            }
+        }
+    }
+
+    /// Get the current size of the file cache, in bytes.
+    ///
+    /// Returns an error if the query fails, returns no rows, or reports a
+    /// negative size.
+    #[tracing::instrument(skip_all)]
+    pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
+        let bytes = self
+            .query_with_retry(
+                // The file cache GUC variable is in MiB, but the conversion with
+                // pg_size_bytes means that the end result we get is in bytes.
+                "SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));",
+                &[],
+            )
+            .await
+            .context("failed to query pg for file cache size")?
+            .first()
+            .ok_or_else(|| anyhow!("file cache size query returned no rows"))?
+            // pg_size_bytes returns a bigint which is the same as an i64.
+            .try_get::<_, i64>(0)
+            .context("failed to extract file cache size from query result")?;
+
+        // Use a checked conversion: an `as` cast would silently wrap a negative
+        // value into an enormous u64.
+        u64::try_from(bytes).context("file cache size reported by postgres is negative")
+    }
+
+    /// Attempt to set the file cache size, returning the size it was actually
+    /// set to.
+    ///
+    /// `num_bytes` is capped at `neon.max_file_cache_size` and rounded down to
+    /// a whole number of MiB; the returned value is in bytes.
+    #[tracing::instrument(skip_all, fields(%num_bytes))]
+    pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
+        let raw_max_bytes = self
+            // The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
+            // means that the end result we get is in bytes.
+            .query_with_retry(
+                "SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
+                &[],
+            )
+            .await
+            .context("failed to query pg for max file cache size")?
+            .first()
+            .ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
+            .try_get::<_, i64>(0)
+            .context("failed to extract max file cache size from query result")?;
+
+        // Checked conversion, so that a negative value cannot wrap into a huge cap.
+        let max_bytes = u64::try_from(raw_max_bytes)
+            .context("max file cache size reported by postgres is negative")?;
+
+        let max_mb = max_bytes / MiB;
+        let num_mb = u64::min(num_bytes, max_bytes) / MiB;
+
+        let capped = if num_bytes > max_bytes {
+            " (capped by maximum size)"
+        } else {
+            ""
+        };
+
+        info!(
+            size = num_mb,
+            max = max_mb,
+            "updating file cache size {capped}",
+        );
+
+        // note: even though the normal ways to get the cache size produce values with trailing "MB"
+        // (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
+        // it expects to set the value is "integer number of MB" without trailing units.
+        // For some reason, this *really* wasn't working with normal arguments, so that's
+        // why we're constructing the query here. (`num_mb` is an integer, so the
+        // interpolation cannot inject arbitrary SQL.)
+        self.client
+            .query(
+                &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
+                &[],
+            )
+            .await
+            .context("failed to change file cache size limit")?;
+
+        // must use pg_reload_conf to have the settings change take effect
+        self.client
+            .execute("SELECT pg_reload_conf();", &[])
+            .await
+            .context("failed to reload config")?;
+
+        Ok(num_mb * MiB)
+    }
+}
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -0,0 +1,215 @@
+#![cfg(target_os = "linux")]
+
+use anyhow::Context;
+use axum::{
+    extract::{ws::WebSocket, State, WebSocketUpgrade},
+    response::Response,
+};
+use axum::{routing::get, Router, Server};
+use clap::Parser;
+use futures::Future;
+use std::{fmt::Debug, time::Duration};
+use sysinfo::{RefreshKind, System, SystemExt};
+use tokio::{sync::broadcast, task::JoinHandle};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+
+use runner::Runner;
+
+// Code that interfaces with agent
+pub mod dispatcher;
+pub mod protocol;
+
+pub mod cgroup;
+pub mod filecache;
+pub mod runner;
+
+/// The vm-monitor is an autoscaling component started by compute_ctl.
+///
+/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
+/// memory pressure by making requests to the autoscaler-agent.
+//
+// NOTE: clap's derive macro turns the `///` comments on this struct and its
+// fields into the CLI help text, so editing them changes the `--help` output.
+#[derive(Debug, Parser)]
+pub struct Args {
+    /// The name of the cgroup we should monitor for memory.high events. This
+    /// is the cgroup that postgres should be running in.
+    #[arg(short, long)]
+    pub cgroup: Option<String>,
+
+    /// The connection string for the Postgres file cache we should manage.
+    #[arg(short, long)]
+    pub pgconnstr: Option<String>,
+
+    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
+    /// kernel's page cache), and therefore should not count against available memory.
+    //
+    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
+    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
+    // during the switch away from an in-memory file cache, we had to default to the previous
+    // behavior.
+    #[arg(long)]
+    pub file_cache_on_disk: bool,
+
+    /// The address we should listen on for connection requests. For the
+    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
+    // This is the only argument without a default or `Option`, so it must be supplied.
+    #[arg(short, long)]
+    pub addr: String,
+}
+
+impl Args {
+    /// Borrow the listen address exactly as it was supplied.
+    pub fn addr(&self) -> &str {
+        self.addr.as_str()
+    }
+}
+
+/// Size of one mebibyte (2^20), in bytes.
+#[allow(non_upper_case_globals)]
+const MiB: u64 = 1 << 20;
+
+/// Express a byte count as a (fractional) number of mebibytes.
+///
+/// Intended for display; most calculations in this crate stay in bytes.
+pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
+    let bytes_per_mib = MiB as f32;
+    bytes as f32 / bytes_per_mib
+}
+
+/// Query `sysinfo` for the machine's total memory, refreshing only memory information.
+pub fn get_total_system_memory() -> u64 {
+    let refresh = RefreshKind::new().with_memory();
+    let system = System::new_with_specifics(refresh);
+    system.total_memory()
+}
+
+/// Global app state for the Axum server
+#[derive(Debug, Clone)]
+pub struct ServerState {
+    /// Used to close old connections.
+    ///
+    /// When a new connection is made, we send a message signalling to the old
+    /// connection to close.
+    pub sender: broadcast::Sender<()>,
+
+    /// Used to cancel all spawned tasks in the monitor.
+    pub token: CancellationToken,
+
+    /// The parsed CLI args, handed to every connection handler.
+    pub args: &'static Args,
+}
+
+/// Spawn a task that may get cancelled by the provided [`CancellationToken`].
+///
+/// This is mainly meant to be called with futures that will be pending for a very
+/// long time, or are not meant to return. If it is not desirable for the future to
+/// ever resolve, such as in the case of [`cgroup::CgroupWatcher::watch`], the error can
+/// be logged with `f`.
+///
+/// `f` is called with a reference to the future's output if the future finishes
+/// before `token` is cancelled. The returned handle resolves to `Some(output)` in
+/// that case, and to `None` if cancellation was observed first.
+pub fn spawn_with_cancel<T, F>(
+    token: CancellationToken,
+    f: F,
+    future: T,
+) -> JoinHandle<Option<T::Output>>
+where
+    T: Future + Send + 'static,
+    T::Output: Send + 'static,
+    F: FnOnce(&T::Output) + Send + 'static,
+{
+    tokio::spawn(async move {
+        // Race the future against cancellation; whichever completes first wins.
+        tokio::select! {
+            _ = token.cancelled() => {
+                info!("received global kill signal");
+                None
+            }
+            res = future => {
+                f(&res);
+                Some(res)
+            }
+        }
+    })
+}
+
+/// The entrypoint to the monitor: start the HTTP server and serve until it exits.
+///
+/// Binds to `args.addr()` and exposes a single `/monitor` route, which is
+/// upgraded to a websocket connection by [`ws_handler`].
+///
+/// # Errors
+///
+/// Returns an error if the address cannot be parsed as a socket address, if
+/// binding to it fails, or if the server exits with an error.
+pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
+    // This channel is used to close old connections. When a new connection is
+    // made, we send a message signalling to the old connection to close.
+    let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
+
+    let app = Router::new()
+        // This route gets upgraded to a websocket connection. We only support
+        // one connection at a time, which we enforce by killing old connections
+        // when we receive a new one.
+        .route("/monitor", get(ws_handler))
+        .with_state(ServerState {
+            sender,
+            token,
+            args,
+        });
+
+    let addr = args.addr();
+    // The address comes straight from the command line, so report a malformed
+    // value as an error instead of panicking.
+    let socket_addr: std::net::SocketAddr = addr
+        .parse()
+        .with_context(|| format!("failed to parse listen address {addr:?}"))?;
+    let bound = Server::try_bind(&socket_addr)
+        .with_context(|| format!("failed to bind to {addr}"))?;
+
+    info!(addr, "server bound");
+
+    bound
+        .serve(app.into_make_service())
+        .await
+        .context("server exited")?;
+
+    Ok(())
+}
+
+/// Handles incoming websocket connections.
+///
+/// If we are already connected to an agent, we kill that old connection
+/// and accept the new one.
+#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
+pub async fn ws_handler(
+    ws: WebSocketUpgrade,
+    State(ServerState {
+        sender,
+        token,
+        args,
+    }): State<ServerState>,
+) -> Response {
+    // Kill the old monitor
+    info!("closing old connection if there is one");
+    // The send result is ignored on purpose: it only fails when nobody is
+    // subscribed, i.e. there is no old connection to close.
+    let _ = sender.send(());
+
+    // Start the new one. Wow, the cycle of death and rebirth
+    // Subscribe only after sending: a broadcast receiver sees just the values
+    // sent after it subscribed, so the new monitor won't receive the kill above.
+    let closer = sender.subscribe();
+    ws.on_upgrade(|ws| start_monitor(ws, args, closer, token))
+}
+
+/// Runs a monitor on top of an accepted websocket connection.
+///
+/// Creating the `Runner` is bounded by a four second timeout. Failure to start,
+/// a timeout, or the monitor exiting are all logged here; nothing is returned
+/// to the caller.
+#[tracing::instrument(skip_all, fields(?args))]
+async fn start_monitor(
+    ws: WebSocket,
+    args: &Args,
+    kill: broadcast::Receiver<()>,
+    token: CancellationToken,
+) {
+    info!("accepted new websocket connection -> starting monitor");
+
+    let timeout = Duration::from_secs(4);
+    let startup = Runner::new(Default::default(), args, ws, kill, token);
+    let mut monitor = match tokio::time::timeout(timeout, startup).await {
+        // The outer `Err` means the timeout elapsed before startup finished.
+        Err(_) => {
+            error!(
+                ?timeout,
+                "creating monitor timed out (probably waiting to receive protocol range)"
+            );
+            return;
+        }
+        Ok(Err(error)) => {
+            error!(?error, "failed to create monitor");
+            return;
+        }
+        Ok(Ok(runner)) => runner,
+    };
+    info!("connected to agent");
+
+    if let Err(e) = monitor.run().await {
+        error!(error = ?e, "monitor terminated unexpectedly");
+    } else {
+        info!("monitor was killed due to new connection");
+    }
+}
--- a/libs/vm_monitor/src/protocol.rs
+++ b/libs/vm_monitor/src/protocol.rs
@@ -0,0 +1,241 @@
+//! Types representing protocols and actual agent-monitor messages.
+//!
+//! The pervasive use of serde modifiers throughout this module is to ease
+//! serialization on the go side. Because go does not have enums (which model
+//! messages well), it is harder to model messages, and we accommodate that with
+//! serde.
+//!
+//! *Note*: the agent sends and receives messages in different ways.
+//!
+//! The agent serializes messages in the form shown below and then sends them.
+//! The use of `#[serde(tag = "type", content = "content")]` allows us to use
+//! `Type` to determine how to deserialize `Content`.
+//! ```ignore
+//! struct {
+//!     Content any
+//!     Type    string
+//!     Id      uint64
+//! }
+//! ```
+//! and receives messages in the form:
+//! ```ignore
+//! struct {
+//!     {fields embedded}
+//!     Type string
+//!     Id   uint64
+//! }
+//! ```
+//! After reading the type field, the agent will decode the entire message
+//! again, this time into the correct type using the embedded fields.
+//! Because the agent cannot just extract the json contained in a certain field
+//! (it initially deserializes to `map[string]interface{}`), we keep the fields
+//! at the top level, so the entire piece of json can be deserialized into a struct,
+//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
+
+use core::fmt;
+use std::cmp;
+
+use serde::{de::Error, Deserialize, Serialize};
+
+/// A message we send to the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct OutboundMsg {
+    /// The message payload; flattened so its fields sit at the top level of the JSON.
+    #[serde(flatten)]
+    pub(crate) inner: OutboundMsgKind,
+    /// Identifier attached to this message.
+    pub(crate) id: usize,
+}
+
+impl OutboundMsg {
+    /// Pair a message payload with the id it should be sent under.
+    pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
+        OutboundMsg { id, inner }
+    }
+}
+
+/// The different underlying message types we can send to the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type")]
+pub enum OutboundMsgKind {
+    /// Indicates that the agent sent an invalid message, i.e., we couldn't
+    /// properly deserialize it.
+    InvalidMessage { error: String },
+    /// Indicates that we experienced an internal error while processing a message.
+    /// For example, if a cgroup operation fails while trying to handle an upscale,
+    /// we return `InternalError`.
+    InternalError { error: String },
+    /// Returned to the agent once we have finished handling an upscale. If the
+    /// handling was unsuccessful, an `InternalError` will get returned instead.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleConfirmation {},
+    /// Indicates to the agent that we are urgently requesting resources.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    UpscaleRequest {},
+    /// Returned to the agent once we have finished attempting to downscale. If
+    /// an error occurred trying to do so, an `InternalError` will get returned instead.
+    /// However, if we are simply unsuccessful (for example, due to needing the resources),
+    /// that gets included in the `DownscaleResult`.
+    DownscaleResult {
+        // FIXME for the future (once the informant is deprecated)
+        // As of the time of writing, the agent/informant version of this struct is
+        // called api.DownscaleResult. This struct has uppercase fields which are
+        // serialized as such. Thus, we serialize using uppercase names so we don't
+        // have to make a breaking change to the agent<->informant protocol. Once
+        // the informant has been superseded by the monitor, we can add the correct
+        // struct tags to api.DownscaleResult without causing a breaking change,
+        // since we don't need to support the agent<->informant protocol anymore.
+        #[serde(rename = "Ok")]
+        ok: bool,
+        #[serde(rename = "Status")]
+        status: String,
+    },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// agent.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// A message received from the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct InboundMsg {
+    /// The message payload; flattened so its `type`/`content` keys sit next to `id`.
+    #[serde(flatten)]
+    pub(crate) inner: InboundMsgKind,
+    /// Identifier attached to this message.
+    pub(crate) id: usize,
+}
+
+/// The different underlying message types we can receive from the agent.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(tag = "type", content = "content")]
+pub enum InboundMsgKind {
+    /// Indicates that we sent an invalid message, i.e., it could not be
+    /// properly deserialized.
+    InvalidMessage { error: String },
+    /// Indicates that the informant experienced an internal error while processing
+    /// a message. For example, if it failed to request upscale from the agent, it
+    /// would return an `InternalError`.
+    // NOTE(review): this variant is received from the agent, so the "informant"
+    // wording above may be stale - confirm against the agent implementation.
+    InternalError { error: String },
+    /// Indicates to us that we have been granted more resources. We should respond
+    /// with an `UpscaleConfirmation` when done handling the resources (increasing
+    /// file cache size, cgroup memory limits).
+    UpscaleNotification { granted: Resources },
+    /// A request to reduce resource usage. We should respond with a `DownscaleResult`,
+    /// when done.
+    DownscaleRequest { target: Resources },
+    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
+    /// agent.
+    /// *Note*: this is a struct variant because of the way go serializes struct{}
+    HealthCheck {},
+}
+
+/// Represents the resources granted to a VM.
+#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
+// Renamed because the agent has multiple resource types:
+// `Resources` (milliCPU/memory slots)
+// `Allocation` (vCPU/bytes) <- what we correspond to
+#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
+pub struct Resources {
+    /// Number of vCPUs (may be fractional, hence `f64`)
+    pub(crate) cpu: f64,
+    /// Bytes of memory
+    pub(crate) mem: u64,
+}
+
+impl Resources {
+    /// Build a `Resources` from a vCPU count and a memory size in bytes.
+    pub fn new(cpu: f64, mem: u64) -> Self {
+        Resources { cpu, mem }
+    }
+}
+
+/// Lowest protocol version this monitor can speak.
+pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+/// Highest protocol version this monitor can speak.
+pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
+
+/// A version of the agent<->monitor protocol, stored as a single number.
+///
+/// Versions are ordered by that number (via the derived `Ord`).
+#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
+pub struct ProtocolVersion(u8);
+
+impl ProtocolVersion {
+    /// Represents v1.0 of the agent<-> monitor protocol - the initial version
+    ///
+    /// Currently the latest version.
+    const V1_0: ProtocolVersion = ProtocolVersion(1);
+}
+
+impl fmt::Display for ProtocolVersion {
+    /// Writes `v1.0` for the known version, `<invalid: zero>` for zero, and
+    /// `<unknown: N>` (with the raw number) for anything else.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            ProtocolVersion(0) => f.write_str("<invalid: zero>"),
+            ProtocolVersion::V1_0 => f.write_str("v1.0"),
+            // Destructure down to the raw number: formatting the
+            // `ProtocolVersion` itself with `{}` would call this impl again
+            // and recurse until the stack overflows.
+            ProtocolVersion(other) => write!(f, "<unknown: {other}>"),
+        }
+    }
+}
+
+/// A set of protocol bounds that determines what we are speaking.
+///
+/// These bounds are inclusive.
+#[derive(Debug)]
+pub struct ProtocolRange {
+    /// Lowest supported version (inclusive).
+    pub min: ProtocolVersion,
+    /// Highest supported version (inclusive).
+    pub max: ProtocolVersion,
+}
+
+// Deserialization is hand-written so that a range with `min > max` is rejected.
+impl<'de> Deserialize<'de> for ProtocolRange {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        // Unvalidated mirror of `ProtocolRange`, used to reuse the derived impl.
+        #[derive(Deserialize)]
+        struct InnerProtocolRange {
+            min: ProtocolVersion,
+            max: ProtocolVersion,
+        }
+
+        let raw = InnerProtocolRange::deserialize(deserializer)?;
+        if raw.min <= raw.max {
+            return Ok(ProtocolRange {
+                min: raw.min,
+                max: raw.max,
+            });
+        }
+        Err(D::Error::custom(format!(
+            "min version = {min} is greater than max version = {max}",
+            min = raw.min,
+            max = raw.max,
+        )))
+    }
+}
+
+impl fmt::Display for ProtocolRange {
+    /// Prints a single version when both bounds coincide, otherwise `min to max`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { min, max } = self;
+        if min != max {
+            write!(f, "{min} to {max}")
+        } else {
+            write!(f, "{max}")
+        }
+    }
+}
+
+impl ProtocolRange {
+    /// Find the highest version contained in both `self` and `other`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the two (inclusive) ranges do not overlap.
+    pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
+        // We first have to make sure the ranges are overlapping. Once we know
+        // this, the highest version both sides support is the smaller of the
+        // two maxes.
+        if self.min > other.max {
+            anyhow::bail!(
+                "Non-overlapping bounds: other.max = {} was less than self.min = {}",
+                other.max,
+                self.min,
+            )
+        } else if self.max < other.min {
+            anyhow::bail!(
+                "Non-overlapping bounds: self.max = {} was less than other.min = {}",
+                self.max,
+                other.min
+            )
+        } else {
+            Ok(cmp::min(self.max, other.max))
+        }
+    }
+}
+
+/// We send this to the agent after negotiating which protocol to use
+#[derive(Serialize, Debug)]
+#[serde(rename_all = "camelCase")]
+pub enum ProtocolResponse {
+    /// No protocol version could be agreed on; carries a description of why.
+    Error(String),
+    /// The protocol version that was selected.
+    Version(ProtocolVersion),
+}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -0,0 +1,460 @@
+//! Exposes the `Runner`, which handles messages received from agent and
+//! sends upscale requests.
+//!
+//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
+//! all functionality.
+
+use std::sync::Arc;
+use std::{fmt::Debug, mem};
+
+use anyhow::{bail, Context};
+use axum::extract::ws::{Message, WebSocket};
+use futures::StreamExt;
+use tokio::sync::broadcast;
+use tokio::sync::mpsc;
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+
+use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
+use crate::dispatcher::Dispatcher;
+use crate::filecache::{FileCacheConfig, FileCacheState};
+use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
+use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
+
+/// Central struct that interacts with agent, dispatcher, and cgroup to handle
+/// signals from the agent.
+#[derive(Debug)]
+pub struct Runner {
+    /// Tunables for this runner; see [`Config`].
+    config: Config,
+    /// File cache state. `Some` only when `Runner::new` was given
+    /// `args.pgconnstr`; otherwise the file cache is left unmanaged.
+    filecache: Option<FileCacheState>,
+    /// Cgroup watcher. `Some` only when `Runner::new` was given `args.cgroup`;
+    /// otherwise no cgroup is managed.
+    cgroup: Option<Arc<CgroupWatcher>>,
+    /// Built from the agent websocket in `Runner::new`. Used to send and
+    /// receive agent messages, to receive upscale requests originating from
+    /// the cgroup watcher, and to notify it of granted upscales.
+    dispatcher: Dispatcher,
+
+    /// We "mint" new message ids by incrementing this counter and taking the value.
+    ///
+    /// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
+    /// by us vs the autoscaler-agent. It starts at 1 and is advanced by 2 each time.
+    counter: usize,
+
+    /// A signal to kill the main thread produced by `self.run()`. This is triggered
+    /// when the server receives a new connection. When the thread receives the
+    /// signal off this channel, it will gracefully shutdown.
+    kill: broadcast::Receiver<()>,
+}
+
+/// Configuration for a `Runner`
+#[derive(Debug)]
+pub struct Config {
+    /// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
+    /// handing out the rest to userspace. This value is the estimated difference between the
+    /// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
+    ///
+    /// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
+    /// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
+    ///
+    /// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
+    /// size, rather than the self-reported memory size, according to the kernel.
+    ///
+    /// Concretely, it is subtracted (saturating at zero) from the memory amount carried by
+    /// upscale notifications and downscale requests before the file cache and cgroup sizes are
+    /// derived from it.
+    ///
+    /// Must be non-zero: `Runner::new` rejects a `Config` whose `sys_buffer_bytes` is 0.
+    ///
+    /// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
+    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
+    /// should be removed once we have a better solution there.
+    sys_buffer_bytes: u64,
+}
+
+impl Default for Config {
+    /// Default configuration: assume the kernel keeps 100 MiB for itself.
+    fn default() -> Self {
+        let sys_buffer_bytes = 100 * MiB;
+        Config { sys_buffer_bytes }
+    }
+}
+
+impl Runner {
+    /// Create a new monitor.
+    ///
+    /// Initialization happens in a fixed order:
+    /// 1. the dispatcher is created on top of `ws`;
+    /// 2. if `args.pgconnstr` is set, the file cache is connected to and resized;
+    /// 3. if `args.cgroup` is set, the cgroup watcher is created, its memory
+    ///    limits are set, and its watch loop is spawned under `token`.
+    ///
+    /// `kill` is stored on the returned `Runner` and polled by `Runner::run`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `config.sys_buffer_bytes` is 0, or if creating the dispatcher,
+    /// the file cache or the cgroup watcher fails, or if the initial file cache
+    /// size or cgroup memory limits cannot be applied.
+    #[tracing::instrument(skip_all, fields(?config, ?args))]
+    pub async fn new(
+        config: Config,
+        args: &Args,
+        ws: WebSocket,
+        kill: broadcast::Receiver<()>,
+        token: CancellationToken,
+    ) -> anyhow::Result<Runner> {
+        anyhow::ensure!(
+            config.sys_buffer_bytes != 0,
+            "invalid monitor Config: sys_buffer_bytes cannot be 0"
+        );
+
+        // *NOTE*: the dispatcher and cgroup manager talk through these channels
+        // so make sure they each get the correct half, nothing is dropped, etc.
+        // The dispatcher gets `notified_send` + `requesting_recv`; the cgroup
+        // watcher gets `requesting_send` + `notified_recv`.
+        let (notified_send, notified_recv) = mpsc::channel(1);
+        let (requesting_send, requesting_recv) = mpsc::channel(1);
+
+        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
+            .await
+            .context("error creating new dispatcher")?;
+
+        let mut state = Runner {
+            config,
+            filecache: None,
+            cgroup: None,
+            dispatcher,
+            counter: 1, // NB: must be odd, see the comment about the field for more.
+            kill,
+        };
+
+        // Stays 0 unless an in-memory file cache is configured below.
+        let mut file_cache_reserved_bytes = 0;
+        let mem = get_total_system_memory();
+
+        // We need to process file cache initialization before cgroup initialization, so that the memory
+        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
+        // memory limits.
+        if let Some(connstr) = &args.pgconnstr {
+            info!("initializing file cache");
+            let config = match args.file_cache_on_disk {
+                true => FileCacheConfig::default_on_disk(),
+                false => FileCacheConfig::default_in_memory(),
+            };
+
+            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+                .await
+                .context("failed to create file cache")?;
+
+            let size = file_cache
+                .get_file_cache_size()
+                .await
+                .context("error getting file cache size")?;
+
+            let new_size = file_cache.config.calculate_cache_size(mem);
+            info!(
+                initial = bytes_to_mebibytes(size),
+                new = bytes_to_mebibytes(new_size),
+                "setting initial file cache size",
+            );
+
+            // note: even if size == new_size, we want to explicitly set it, just
+            // to make sure that we have the permissions to do so
+            let actual_size = file_cache
+                .set_file_cache_size(new_size)
+                .await
+                .context("failed to set file cache size, possibly due to inadequate permissions")?;
+            if actual_size != new_size {
+                info!("file cache size actually got set to {actual_size}")
+            }
+            // Mark the resources given to the file cache as reserved, but only if it's in memory.
+            if !args.file_cache_on_disk {
+                file_cache_reserved_bytes = actual_size;
+            }
+
+            state.filecache = Some(file_cache);
+        }
+
+        if let Some(name) = &args.cgroup {
+            let (mut cgroup, cgroup_event_stream) =
+                CgroupWatcher::new(name.clone(), requesting_send)
+                    .context("failed to create cgroup manager")?;
+
+            // NOTE(review): unchecked subtraction; this relies on the file
+            // cache never being sized above total system memory — confirm.
+            let available = mem - file_cache_reserved_bytes;
+
+            cgroup
+                .set_memory_limits(available)
+                .context("failed to set cgroup memory limits")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            // Some might call this . . . cgroup v2
+            let cgroup_clone = Arc::clone(&cgroup);
+
+            // The watch loop runs in the background, taking the receiving half
+            // of the "notified" channel and the cgroup event stream.
+            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
+                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
+            });
+
+            state.cgroup = Some(cgroup);
+        } else {
+            // *NOTE*: We need to forget the sender so that its drop impl does not get run.
+            // This allows us to poll it in `Runner::run` regardless of whether we
+            // are managing a cgroup or not. If we don't forget it, all receives will
+            // immediately return an error because the sender is dropped and it will
+            // claim all select! statements, effectively turning `Runner::run` into
+            // `loop { fail to receive }`.
+            mem::forget(requesting_send);
+        }
+
+        Ok(state)
+    }
+
+    /// Attempt to downscale filecache + cgroup
+    ///
+    /// `target.mem` is the requested total memory; `sys_buffer_bytes` is
+    /// subtracted from it (saturating) before any sizes are derived.
+    ///
+    /// Returns `(ok, status)`:
+    /// - `(true, _)` if the downscale was applied, or if neither a cgroup nor a
+    ///   file cache is managed (nothing to do);
+    /// - `(false, _)` if the cgroup's new `memory.high` would fall below its
+    ///   current usage plus the configured buffer. In that case nothing is changed.
+    ///
+    /// `status` is a human-readable description of what was (or was not) done.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the cgroup's memory usage cannot be read, or if setting the
+    /// file cache size or the cgroup limits fails.
+    #[tracing::instrument(skip_all, fields(?target))]
+    pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
+        // Nothing to adjust
+        if self.cgroup.is_none() && self.filecache.is_none() {
+            info!("no action needed for downscale (no cgroup or file cache enabled)");
+            return Ok((
+                true,
+                "monitor is not managing cgroup or file cache".to_string(),
+            ));
+        }
+
+        let requested_mem = target.mem;
+        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
+        // Size the file cache *would* get at the target memory; 0 if there is no file cache.
+        let expected_file_cache_mem_usage = self
+            .filecache
+            .as_ref()
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
+        let mut new_cgroup_mem_high = 0;
+        if let Some(cgroup) = &self.cgroup {
+            // NOTE(review): unchecked u64 subtraction; relies on
+            // `calculate_cache_size` never exceeding its argument — confirm.
+            new_cgroup_mem_high = cgroup
+                .config
+                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
+
+            let current = cgroup
+                .current_memory_usage()
+                .context("failed to fetch cgroup memory")?;
+
+            // Refuse the downscale before touching anything if the cgroup
+            // would not keep at least `memory_high_buffer_bytes` of headroom.
+            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
+                let status = format!(
+                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
+                    "calculated memory.high too low",
+                    bytes_to_mebibytes(new_cgroup_mem_high),
+                    bytes_to_mebibytes(current),
+                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
+                );
+
+                info!(status, "discontinuing downscale");
+
+                return Ok((false, status));
+            }
+        }
+
+        // The downscaling has been approved. Downscale the file cache, then the cgroup.
+        let mut status = vec![];
+        // Only an in-memory file cache counts against the cgroup's memory.
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_file_cache_mem_usage)
+                .await
+                .context("failed to set file cache size")?;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
+            }
+            let message = format!(
+                "set file cache size to {} MiB (in memory = {})",
+                bytes_to_mebibytes(actual_usage),
+                file_cache.config.in_memory,
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            // NOTE(review): unchecked u64 subtraction; relies on the file cache
+            // never ending up larger than the usable memory — confirm.
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+
+            // The earlier memory.high was computed from the *expected* file
+            // cache size; recompute it if the memory actually attributed to
+            // the file cache turned out different.
+            if file_cache_mem_usage != expected_file_cache_mem_usage {
+                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            }
+
+            let limits = MemoryLimits::new(
+                // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be 0 here
+                // since it is properly initialized in the previous cgroup if let block
+                new_cgroup_mem_high,
+                available_memory,
+            );
+            cgroup
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;
+
+            let message = format!(
+                "set cgroup memory.high to {} MiB, of new max {} MiB",
+                bytes_to_mebibytes(new_cgroup_mem_high),
+                bytes_to_mebibytes(available_memory)
+            );
+            info!("downscale: {message}");
+            status.push(message);
+        }
+
+        // TODO: make this status thing less jank
+        let status = status.join("; ");
+        Ok((true, status))
+    }
+
+    /// Handle new resources
+    ///
+    /// Resizes the file cache (if managed) and then raises the cgroup limits
+    /// (if managed) to match `resources.mem`, after subtracting
+    /// `sys_buffer_bytes` (saturating). Does nothing when neither a file cache
+    /// nor a cgroup is managed.
+    ///
+    /// # Errors
+    ///
+    /// Fails if setting the file cache size or the cgroup memory limits fails.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
+        if self.filecache.is_none() && self.cgroup.is_none() {
+            info!("no action needed for upscale (no cgroup or file cache enabled)");
+            return Ok(());
+        }
+
+        let new_mem = resources.mem;
+        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
+
+        // Get the file cache's expected contribution to the memory usage.
+        // Only an in-memory file cache counts against the cgroup's memory, so
+        // this stays 0 for an on-disk cache or when there is no cache at all.
+        let mut file_cache_mem_usage = 0;
+        if let Some(file_cache) = &mut self.filecache {
+            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
+            info!(
+                target = bytes_to_mebibytes(expected_usage),
+                total = bytes_to_mebibytes(new_mem),
+                "updating file cache size",
+            );
+
+            let actual_usage = file_cache
+                .set_file_cache_size(expected_usage)
+                .await
+                .context("failed to set file cache size")?;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
+            }
+
+            // Not fatal: the cgroup limits below are derived from the size the
+            // cache actually ended up with.
+            if actual_usage != expected_usage {
+                warn!(
+                    "file cache was set to a different size that we wanted: target = {} Mib, actual= {} Mib",
+                    bytes_to_mebibytes(expected_usage),
+                    bytes_to_mebibytes(actual_usage)
+                )
+            }
+        }
+
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            info!(
+                target = bytes_to_mebibytes(new_cgroup_mem_high),
+                total = bytes_to_mebibytes(new_mem),
+                name = cgroup.path(),
+                "updating cgroup memory.high",
+            );
+            let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
+            // This is the cgroup step, so report it as such (the same wording
+            // `try_downscale` uses) rather than as a file cache failure.
+            cgroup
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;
+        }
+
+        Ok(())
+    }
+
+    /// Take in a message and perform some action, such as downscaling or upscaling,
+    /// and return a message to be sent back.
+    ///
+    /// Per message kind:
+    /// - `UpscaleNotification`: applies the granted resources, notifies the
+    ///   cgroup side through the dispatcher, and replies with `UpscaleConfirmation`.
+    /// - `DownscaleRequest`: attempts the downscale and replies with a
+    ///   `DownscaleResult` carrying the outcome.
+    /// - `InvalidMessage` / `InternalError`: logged at warn level; no reply (`Ok(None)`).
+    /// - `HealthCheck`: replies with `HealthCheck`.
+    ///
+    /// Every reply reuses the `id` of the inbound message.
+    ///
+    /// # Errors
+    ///
+    /// Fails if handling the upscale, notifying the cgroup of it, or
+    /// performing the downscale fails.
+    #[tracing::instrument(skip_all, fields(%id, message = ?inner))]
+    pub async fn process_message(
+        &mut self,
+        InboundMsg { inner, id }: InboundMsg,
+    ) -> anyhow::Result<Option<OutboundMsg>> {
+        match inner {
+            InboundMsgKind::UpscaleNotification { granted } => {
+                // Apply the new limits first, then let the cgroup side know.
+                self.handle_upscale(granted)
+                    .await
+                    .context("failed to handle upscale")?;
+                self.dispatcher
+                    .notify_upscale(Sequenced::new(granted))
+                    .await
+                    .context("failed to notify cgroup of upscale")?;
+                Ok(Some(OutboundMsg::new(
+                    OutboundMsgKind::UpscaleConfirmation {},
+                    id,
+                )))
+            }
+            InboundMsgKind::DownscaleRequest { target } => self
+                .try_downscale(target)
+                .await
+                .context("failed to downscale")
+                .map(|(ok, status)| {
+                    Some(OutboundMsg::new(
+                        OutboundMsgKind::DownscaleResult { ok, status },
+                        id,
+                    ))
+                }),
+            InboundMsgKind::InvalidMessage { error } => {
+                warn!(
+                    %error, id, "received notification of an invalid message we sent"
+                );
+                Ok(None)
+            }
+            InboundMsgKind::InternalError { error } => {
+                warn!(error, id, "agent experienced an internal error");
+                Ok(None)
+            }
+            InboundMsgKind::HealthCheck {} => {
+                Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id)))
+            }
+        }
+    }
+
+    // TODO: don't propagate errors, probably just warn!?
+    /// Main loop: waits on the kill signal, upscale requests coming from the
+    /// cgroup side, and messages from the agent, handling whichever is ready.
+    ///
+    /// Returns `Ok(())` only when a kill signal is received. Returns an error if
+    /// the kill channel fails, the upscale-request channel is closed, an
+    /// outbound message cannot be sent, a text message cannot be deserialized,
+    /// or the agent connection is closed.
+    ///
+    /// Errors from `process_message` do not end the loop; they are reported
+    /// back to the agent as an `InternalError` with the inbound message's id.
+    #[tracing::instrument(skip_all)]
+    pub async fn run(&mut self) -> anyhow::Result<()> {
+        info!("starting dispatcher");
+        loop {
+            tokio::select! {
+                signal = self.kill.recv() => {
+                    match signal {
+                        Ok(()) => return Ok(()),
+                        Err(e) => bail!("failed to receive kill signal: {e}")
+                    }
+                }
+                // we need to propagate an upscale request
+                request = self.dispatcher.request_upscale_events.recv() => {
+                    // `None` means the channel is closed.
+                    if request.is_none() {
+                        bail!("failed to listen for upscale event from cgroup")
+                    }
+                    info!("cgroup asking for upscale; forwarding request");
+                    self.counter += 2; // Increment, preserving parity (i.e. keep the
+                                       // counter odd). See the field comment for more.
+                    self.dispatcher
+                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
+                        .await
+                        .context("failed to send message")?;
+                }
+                // there is a message from the agent
+                msg = self.dispatcher.source.next() => {
+                    if let Some(msg) = msg {
+                        // Don't use 'message' as a key as the string also uses
+                        // that for its key
+                        info!(?msg, "received message");
+                        match msg {
+                            Ok(msg) => {
+                                let message: InboundMsg = match msg {
+                                    Message::Text(text) => {
+                                        serde_json::from_str(&text).context("failed to deserialize text message")?
+                                    }
+                                    // Any non-text frame is logged and skipped.
+                                    other => {
+                                        warn!(
+                                            // Don't use 'message' as a key as the
+                                            // string also uses that for its key
+                                            msg = ?other,
+                                            "agent should only send text messages but received different type"
+                                        );
+                                        continue
+                                    },
+                                };
+
+                                // `process_message` consumes its argument, so a clone is
+                                // passed to keep `message.id` available for the error reply.
+                                let out = match self.process_message(message.clone()).await {
+                                    Ok(Some(out)) => out,
+                                    // Nothing to send back for this message.
+                                    Ok(None) => continue,
+                                    Err(e) => {
+                                        let error = e.to_string();
+                                        warn!(?error, "error handling message");
+                                        OutboundMsg::new(
+                                            OutboundMsgKind::InternalError {
+                                                error
+                                            },
+                                            message.id
+                                        )
+                                    }
+                                };
+
+                                self.dispatcher
+                                    .send(out)
+                                    .await
+                                    .context("failed to send message")?;
+                            }
+                            // Errors yielded by the stream are logged and the loop continues.
+                            Err(e) => warn!("{e}"),
+                        }
+                    } else {
+                        // The stream ended.
+                        anyhow::bail!("dispatcher connection closed")
+                    }
+                }
+            }
+        }
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -51,6 +51,7 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
+smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
 tokio-tar.workspace = true
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -10,7 +10,7 @@ use std::{fs, path::Path, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
-use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
+use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
@@ -97,7 +97,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path)?);
-    let summary_blk = file.read_blk(0)?;
+    let summary_blk = file.read_blk(0).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -44,13 +44,11 @@ pub(crate) enum LayerCmd {
 }

 async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
-    use pageserver::tenant::block_io::BlockReader;
-
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path)?);
-    let summary_blk = file.read_blk(0)?;
+    let summary_blk = file.read_blk(0).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -70,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
            },
        )
        .await?;
-    let cursor = BlockCursor::new(&file);
+    let cursor = BlockCursor::new_fileblockreader_virtual(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -643,23 +643,6 @@ impl PageServerConf {
            .join(METADATA_FILE_NAME)
    }

-    /// Files on the remote storage are stored with paths, relative to the workdir.
-    /// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
-    ///
-    /// Errors if the path provided does not start from pageserver's workdir.
-    pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
-        local_path
-            .strip_prefix(&self.workdir)
-            .context("Failed to strip workdir prefix")
-            .and_then(RemotePath::new)
-            .with_context(|| {
-                format!(
-                    "Failed to resolve remote part of path {:?} for base {:?}",
-                    local_path, self.workdir
-                )
-            })
-    }
-
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
        remote_path.with_base(&self.workdir)
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,10 +75,7 @@
 use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
-    sync::{
-        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
-        RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
-    },
+    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
 };

 use anyhow::Context;
@@ -162,7 +159,7 @@ struct Version {
 }

 struct Slot {
-    inner: RwLock<SlotInner>,
+    inner: tokio::sync::RwLock<SlotInner>,
    usage_count: AtomicU8,
 }

@@ -203,6 +200,11 @@ impl Slot {
            Err(usage_count) => usage_count,
        }
    }
+
+    /// Sets the usage count to a specific value.
+    ///
+    /// Overwrites whatever count was stored before; the store uses
+    /// `Ordering::Relaxed`.
+    fn set_usage_count(&self, count: u8) {
+        self.usage_count.store(count, Ordering::Relaxed);
+    }
 }

 pub struct PageCache {
@@ -215,9 +217,9 @@ pub struct PageCache {
    ///
    /// If you add support for caching different kinds of objects, each object kind
    /// can have a separate mapping map, next to this field.
-    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
+    materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,
+    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -233,7 +235,7 @@ pub struct PageCache {
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);

 impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];
@@ -260,9 +262,10 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 /// to initialize.
 ///
 pub struct PageWriteGuard<'i> {
-    inner: RwLockWriteGuard<'i, SlotInner>,
+    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,

    // Are the page contents currently valid?
+    // Used to mark pages as invalid that are assigned but not yet filled with data.
    valid: bool,
 }

@@ -337,7 +340,7 @@ impl PageCache {
    /// The 'lsn' is an upper bound, this will return the latest version of
    /// the given block, but not newer than 'lsn'. Returns the actual LSN of the
    /// returned page.
-    pub fn lookup_materialized_page(
+    pub async fn lookup_materialized_page(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -357,7 +360,7 @@ impl PageCache {
            lsn,
        };

-        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
+        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
            if let CacheKey::MaterializedPage {
                hash_key: _,
                lsn: available_lsn,
@@ -384,7 +387,7 @@ impl PageCache {
    ///
    /// Store an image of the given page in the cache.
    ///
-    pub fn memorize_materialized_page(
+    pub async fn memorize_materialized_page(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -401,7 +404,7 @@ impl PageCache {
            lsn,
        };

-        match self.lock_for_write(&cache_key)? {
+        match self.lock_for_write(&cache_key).await? {
            WriteBufResult::Found(write_guard) => {
                // We already had it in cache. Another thread must've put it there
                // concurrently. Check that it had the same contents that we
@@ -419,31 +422,14 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with immutable file pages.

-    pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
+    pub async fn read_immutable_buf(
+        &self,
+        file_id: FileId,
+        blkno: u32,
+    ) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

-        self.lock_for_read(&mut cache_key)
-    }
-
-    /// Immediately drop all buffers belonging to given file
-    pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
-        for slot_idx in 0..self.slots.len() {
-            let slot = &self.slots[slot_idx];
-
-            let mut inner = slot.inner.write().unwrap();
-            if let Some(key) = &inner.key {
-                match key {
-                    CacheKey::ImmutableFilePage { file_id, blkno: _ }
-                        if *file_id == drop_file_id =>
-                    {
-                        // remove mapping for old buffer
-                        self.remove_mapping(key);
-                        inner.key = None;
-                    }
-                    _ => {}
-                }
-            }
-        }
+        self.lock_for_read(&mut cache_key).await
    }

    //
@@ -463,14 +449,14 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we released the mapping
            // lock already, another thread could have evicted the page)
            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.read().unwrap();
+            let inner = slot.inner.read().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
                return Some(PageReadGuard(inner));
@@ -511,7 +497,7 @@ impl PageCache {
    /// }
    /// ```
    ///
-    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
+    async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
@@ -526,7 +512,7 @@ impl PageCache {
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
+            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
                if is_first_iteration {
                    hit.inc();
                }
@@ -556,7 +542,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.usage_count.store(1, Ordering::Relaxed);
+            slot.set_usage_count(1);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
                inner,
@@ -569,13 +555,13 @@ impl PageCache {
    /// found, returns None.
    ///
    /// When locking a page for writing, the search criteria is always "exact".
-    fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
+    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we don't released the mapping
            // lock already, another thread could have evicted the page)
            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.write().unwrap();
+            let inner = slot.inner.write().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
                return Some(PageWriteGuard { inner, valid: true });
@@ -588,10 +574,10 @@ impl PageCache {
    ///
    /// Similar to lock_for_read(), but the returned buffer is write-locked and
    /// may be modified by the caller even if it's already found in the cache.
-    fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
+    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
        loop {
            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key) {
+            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
                return Ok(WriteBufResult::Found(write_guard));
            }

@@ -617,7 +603,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.usage_count.store(1, Ordering::Relaxed);
+            slot.set_usage_count(1);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
                inner,
@@ -772,7 +758,7 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
+    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
@@ -784,10 +770,7 @@ impl PageCache {
            if slot.dec_usage_count() == 0 {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
-                    Err(TryLockError::Poisoned(err)) => {
-                        anyhow::bail!("buffer lock was poisoned: {err:?}")
-                    }
-                    Err(TryLockError::WouldBlock) => {
+                    Err(_err) => {
                        // If we have looped through the whole buffer pool 10 times
                        // and still haven't found a victim buffer, something's wrong.
                        // Maybe all the buffers were in locked. That could happen in
@@ -816,6 +799,8 @@ impl PageCache {
    fn new(num_pages: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

+        // We call into_boxed_slice before Box::leak to avoid leaking uninitialized
+        // memory that a Vec's spare capacity might contain.
        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
@@ -829,7 +814,7 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: RwLock::new(SlotInner { key: None, buf }),
+                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -85,6 +85,7 @@ pub use pageserver_api::models::TenantState;
 use toml_edit;
 use utils::{
    crashsafe,
+    generation::Generation,
    id::{TenantId, TimelineId},
    lsn::{Lsn, RecordLsn},
 };
@@ -178,6 +179,11 @@ pub struct Tenant {
    tenant_conf: Arc<RwLock<TenantConfOpt>>,

    tenant_id: TenantId,
+
+    /// The remote storage generation, used to protect S3 objects from split-brain.
+    /// Does not change over the lifetime of the [`Tenant`] object.
+    generation: Generation,
+
    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
@@ -422,13 +428,53 @@ impl Tenant {
            init_order,
            CreateTimelineCause::Load,
        )?;
-        let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
+        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
+            disk_consistent_lsn.is_valid(),
            "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
        );
+        assert_eq!(
+            disk_consistent_lsn,
+            up_to_date_metadata.disk_consistent_lsn(),
+            "these are used interchangeably"
+        );
+
+        // Save the metadata file to local disk.
+        if !picked_local {
+            save_metadata(
+                self.conf,
+                &tenant_id,
+                &timeline_id,
+                up_to_date_metadata,
+                first_save,
+            )
+            .context("save_metadata")?;
+        }
+
+        let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
+
+        if let Some(index_part) = index_part {
+            timeline
+                .remote_client
+                .as_ref()
+                .unwrap()
+                .init_upload_queue(index_part)?;
+        } else if self.remote_storage.is_some() {
+            // No data on the remote storage, but we have a local metadata file. We can end up
+            // here with timeline_create being interrupted before finishing index part upload.
+            // By doing what we do here, the index part upload is retried.
+            // If control plane retries timeline creation in the meantime, the mgmt API handler
+            // for timeline creation will coalesce on the upload we queue here.
+            let rtc = timeline.remote_client.as_ref().unwrap();
+            rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
+            rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
+        }
+
        timeline
-            .load_layer_map(new_disk_consistent_lsn)
+            .load_layer_map(
+                disk_consistent_lsn,
+                remote_startup_data.map(|x| x.index_part),
+            )
            .await
            .with_context(|| {
                format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
@@ -452,19 +498,6 @@ impl Tenant {
            }
        };

-        if self.remote_storage.is_some() {
-            // Reconcile local state with remote storage, downloading anything that's
-            // missing locally, and scheduling uploads for anything that's missing
-            // in remote storage.
-            timeline
-                .reconcile_with_remote(
-                    up_to_date_metadata,
-                    remote_startup_data.as_ref().map(|r| &r.index_part),
-                )
-                .await
-                .context("failed to reconcile with remote")?
-        }
-
        // Sanity check: a timeline should have some content.
        anyhow::ensure!(
            ancestor.is_some()
@@ -479,18 +512,6 @@ impl Tenant {
            "Timeline has no ancestor and no layer files"
        );

-        // Save the metadata file to local disk.
-        if !picked_local {
-            save_metadata(
-                self.conf,
-                &tenant_id,
-                &timeline_id,
-                up_to_date_metadata,
-                first_save,
-            )
-            .context("save_metadata")?;
-        }
-
        Ok(())
    }

@@ -507,6 +528,7 @@ impl Tenant {
    pub(crate) fn spawn_attach(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
+        generation: Generation,
        broker_client: storage_broker::BrokerClientChannel,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        remote_storage: GenericRemoteStorage,
@@ -523,6 +545,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
+            generation,
            Some(remote_storage.clone()),
        ));

@@ -633,12 +656,8 @@ impl Tenant {
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;

-        let remote_timeline_ids = remote_timeline_client::list_remote_timelines(
-            remote_storage,
-            self.conf,
-            self.tenant_id,
-        )
-        .await?;
+        let remote_timeline_ids =
+            remote_timeline_client::list_remote_timelines(remote_storage, self.tenant_id).await?;

        info!("found {} timelines", remote_timeline_ids.len());

@@ -650,6 +669,7 @@ impl Tenant {
                self.conf,
                self.tenant_id,
                timeline_id,
+                self.generation,
            );
            part_downloads.spawn(
                async move {
@@ -683,10 +703,7 @@ impl Tenant {
            debug!("successfully downloaded index part for timeline {timeline_id}");
            match index_part {
                MaybeDeletedIndexPart::IndexPart(index_part) => {
-                    timeline_ancestors.insert(
-                        timeline_id,
-                        index_part.parse_metadata().context("parse_metadata")?,
-                    );
+                    timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
                    remote_index_and_client.insert(timeline_id, (index_part, client));
                }
                MaybeDeletedIndexPart::Deleted(index_part) => {
@@ -737,7 +754,7 @@ impl Tenant {
            DeleteTimelineFlow::resume_deletion(
                Arc::clone(self),
                timeline_id,
-                &index_part.parse_metadata().context("parse_metadata")?,
+                &index_part.metadata,
                Some(remote_timeline_client),
                None,
            )
@@ -839,6 +856,7 @@ impl Tenant {
            TenantConfOpt::default(),
            wal_redo_manager,
            tenant_id,
+            Generation::broken(),
            None,
        ))
    }
@@ -856,6 +874,7 @@ impl Tenant {
    pub(crate) fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
+        generation: Generation,
        resources: TenantSharedResources,
        init_order: Option<InitializationOrder>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
@@ -881,6 +900,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
+            generation,
            remote_storage.clone(),
        );
        let tenant = Arc::new(tenant);
@@ -1299,10 +1319,7 @@ impl Tenant {
                        }
                    };

-                    let remote_metadata = index_part
-                        .parse_metadata()
-                        .context("parse_metadata")
-                        .map_err(LoadLocalTimelineError::Load)?;
+                    let remote_metadata = index_part.metadata.clone();
                    (
                        Some(RemoteStartupData {
                            index_part,
@@ -2265,6 +2282,7 @@ impl Tenant {
            ancestor,
            new_timeline_id,
            self.tenant_id,
+            self.generation,
            Arc::clone(&self.walredo_mgr),
            resources,
            pg_version,
@@ -2282,6 +2300,7 @@ impl Tenant {
        tenant_conf: TenantConfOpt,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        tenant_id: TenantId,
+        generation: Generation,
        remote_storage: Option<GenericRemoteStorage>,
    ) -> Tenant {
        let (state, mut rx) = watch::channel(state);
@@ -2340,6 +2359,7 @@ impl Tenant {

        Tenant {
            tenant_id,
+            generation,
            conf,
            // using now here is good enough approximation to catch tenants with really long
            // activation times.
@@ -2922,6 +2942,7 @@ impl Tenant {
                self.conf,
                self.tenant_id,
                timeline_id,
+                self.generation,
            );
            Some(remote_client)
        } else {
@@ -3445,6 +3466,7 @@ pub mod harness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: TenantConf,
        pub tenant_id: TenantId,
+        pub generation: Generation,
    }

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3486,6 +3508,7 @@ pub mod harness {
                conf,
                tenant_conf,
                tenant_id,
+                generation: Generation::new(0xdeadbeef),
            })
        }

@@ -3512,6 +3535,7 @@ pub mod harness {
                TenantConfOpt::from(self.tenant_conf),
                walredo_mgr,
                self.tenant_id,
+                self.generation,
                remote_storage,
            ));
            tenant
@@ -4092,7 +4116,7 @@ mod tests {
        let mut found_error_message = false;
        let mut err_source = err.source();
        while let Some(source) = err_source {
-            if source.to_string() == "metadata checksum mismatch" {
+            if source.to_string().contains("metadata checksum mismatch") {
                found_error_message = true;
                break;
            }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -12,14 +12,11 @@
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use crate::page_cache::PAGE_SZ;
-use crate::tenant::block_io::{BlockCursor, BlockReader};
+use crate::tenant::block_io::BlockCursor;
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
+impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
@@ -36,7 +33,7 @@ where
        let mut blknum = (offset / PAGE_SZ as u64) as u32;
        let mut off = (offset % PAGE_SZ as u64) as usize;

-        let mut buf = self.read_blk(blknum)?;
+        let mut buf = self.read_blk(blknum).await?;

        // peek at the first byte, to determine if it's a 1- or 4-byte length
        let first_len_byte = buf[off];
@@ -52,7 +49,7 @@ where
                // it is split across two pages
                len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
                blknum += 1;
-                buf = self.read_blk(blknum)?;
+                buf = self.read_blk(blknum).await?;
                len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
                off = 4 - thislen;
            } else {
@@ -73,7 +70,7 @@ where
            if page_remain == 0 {
                // continue on next page
                blknum += 1;
-                buf = self.read_blk(blknum)?;
+                buf = self.read_blk(blknum).await?;
                off = 0;
                page_remain = PAGE_SZ;
            }
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,8 +2,12 @@
 //! Low-level Block-oriented I/O functions
 //!

+use super::ephemeral_file::EphemeralFile;
+use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
+use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
+use std::fs::File;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;

@@ -13,32 +17,20 @@ use std::os::unix::fs::FileExt;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
-    ///
-    /// Read a block. Returns a "lease" object that can be used to
-    /// access to the contents of the page. (For the page cache, the
-    /// lease object represents a lock on the buffer.)
-    ///
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
-
    ///
    /// Create a new "cursor" for reading from this reader.
    ///
    /// A cursor caches the last accessed page, allowing for faster
    /// access if the same block is accessed repeatedly.
-    fn block_cursor(&self) -> BlockCursor<&Self>
-    where
-        Self: Sized,
-    {
-        BlockCursor::new(self)
-    }
+    fn block_cursor(&self) -> BlockCursor<'_>;
 }

 impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        (*self).read_blk(blknum)
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        (*self).block_cursor()
    }
 }

@@ -47,7 +39,7 @@ pub enum BlockLease<'a> {
    PageReadGuard(PageReadGuard<'static>),
    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
    #[cfg(test)]
-    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
+    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
 }

 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -57,9 +49,9 @@ impl From<PageReadGuard<'static>> for BlockLease<'static> {
 }

 #[cfg(test)]
-impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
-    fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
-        BlockLease::Rc(value)
+impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
+    fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
+        BlockLease::Arc(value)
    }
 }

@@ -71,7 +63,35 @@ impl<'a> Deref for BlockLease<'a> {
            BlockLease::PageReadGuard(v) => v.deref(),
            BlockLease::EphemeralFileMutableTail(v) => v,
            #[cfg(test)]
-            BlockLease::Rc(v) => v.deref(),
+            BlockLease::Arc(v) => v.deref(),
+        }
+    }
+}
+
+/// Provides the ability to read blocks from different sources,
+/// similar to using traits for this purpose.
+///
+/// Unlike a trait method, though, the read function here can be async.
+pub(crate) enum BlockReaderRef<'a> {
+    FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
+    FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
+    EphemeralFile(&'a EphemeralFile),
+    Adapter(Adapter<&'a DeltaLayerInner>),
+    #[cfg(test)]
+    TestDisk(&'a super::disk_btree::tests::TestDisk),
+}
+
+impl<'a> BlockReaderRef<'a> {
+    #[inline(always)]
+    async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        use BlockReaderRef::*;
+        match self {
+            FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
+            FileBlockReaderFile(r) => r.read_blk(blknum).await,
+            EphemeralFile(r) => r.read_blk(blknum).await,
+            Adapter(r) => r.read_blk(blknum).await,
+            #[cfg(test)]
+            TestDisk(r) => r.read_blk(blknum),
        }
    }
 }
@@ -93,23 +113,29 @@ impl<'a> Deref for BlockLease<'a> {
 /// // do stuff with 'buf'
 /// ```
 ///
-pub struct BlockCursor<R>
-where
-    R: BlockReader,
-{
-    reader: R,
+pub struct BlockCursor<'a> {
+    reader: BlockReaderRef<'a>,
 }

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
-    pub fn new(reader: R) -> Self {
+impl<'a> BlockCursor<'a> {
+    pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
        BlockCursor { reader }
    }
+    // Needed by cli
+    pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
+        BlockCursor {
+            reader: BlockReaderRef::FileBlockReaderVirtual(reader),
+        }
+    }

-    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.reader.read_blk(blknum)
+    /// Read a block.
+    ///
+    /// Returns a "lease" object that can be used to
+    /// access the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    #[inline(always)]
+    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum).await
    }
 }

@@ -139,17 +165,17 @@ where
        assert!(buf.len() == PAGE_SZ);
        self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
    }
-}
-
-impl<F> BlockReader for FileBlockReader<F>
-where
-    F: FileExt,
-{
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    /// Read a block.
+    ///
+    /// Returns a "lease" object that can be used to
+    /// access the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
                .read_immutable_buf(self.file_id, blknum)
+                .await
                .map_err(|e| {
                    std::io::Error::new(
                        std::io::ErrorKind::Other,
@@ -170,6 +196,18 @@ where
    }
 }

+impl BlockReader for FileBlockReader<File> {
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
+    }
+}
+
+impl BlockReader for FileBlockReader<VirtualFile> {
+    fn block_cursor(&self) -> BlockCursor<'_> {
+        BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
+    }
+}
+
 ///
 /// Trait for block-oriented output
 ///
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -7,6 +7,7 @@ use anyhow::Context;
 use pageserver_api::models::TenantState;
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, instrument, warn, Instrument, Span};

 use utils::{
@@ -82,6 +83,8 @@ async fn create_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await
    .context("mark_upload")?;
@@ -171,6 +174,8 @@ async fn remove_tenant_remote_delete_mark(
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
+            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await
        .context("remove_tenant_remote_delete_mark")?;
@@ -252,6 +257,8 @@ pub(crate) async fn remote_delete_mark_exists(
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        "fetch_tenant_deletion_mark",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await;

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -259,9 +259,10 @@ where
    {
        let mut stack = Vec::new();
        stack.push((self.root_blk, None));
+        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, opt_iter)) = stack.pop() {
            // Locate the node.
-            let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
+            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
@@ -353,8 +354,10 @@ where

        stack.push((self.root_blk, String::new(), 0, 0, 0));

+        let block_cursor = self.reader.block_cursor();
+
        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = self.reader.read_blk(self.start_blk + blknum)?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -683,27 +686,30 @@ impl<const L: usize> BuildNode<L> {
 }

 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
    use super::*;
-    use crate::tenant::block_io::BlockLease;
+    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[derive(Clone, Default)]
-    struct TestDisk {
+    pub(crate) struct TestDisk {
        blocks: Vec<Bytes>,
    }
    impl TestDisk {
        fn new() -> Self {
            Self::default()
        }
-    }
-    impl BlockReader for TestDisk {
-        fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
+        pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::rc::Rc::new(buf).into())
+            Ok(std::sync::Arc::new(buf).into())
+        }
+    }
+    impl BlockReader for TestDisk {
+        fn block_cursor(&self) -> BlockCursor<'_> {
+            BlockCursor::new(BlockReaderRef::TestDisk(self))
        }
    }
    impl BlockWriter for &mut TestDisk {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -3,8 +3,7 @@

 use crate::config::PageServerConf;
 use crate::page_cache::{self, PAGE_SZ};
-use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::{BlockLease, BlockReader};
+use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use std::cmp::min;
 use std::fs::OpenOptions;
@@ -22,7 +21,7 @@ pub struct EphemeralFile {
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
    file: VirtualFile,
-    size: u64,
+    len: u64,
    /// An ephemeral file is append-only.
    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
    /// The other pages, which can no longer be modified, are accessed through the page cache.
@@ -53,27 +52,57 @@ impl EphemeralFile {
            _tenant_id: tenant_id,
            _timeline_id: timeline_id,
            file,
-            size: 0,
+            len: 0,
            mutable_tail: [0u8; PAGE_SZ],
        })
    }

-    pub(crate) fn size(&self) -> u64 {
-        self.size
+    pub(crate) fn len(&self) -> u64 {
+        self.len
    }
-}

-/// Does the given filename look like an ephemeral file?
-pub fn is_ephemeral_file(filename: &str) -> bool {
-    if let Some(rest) = filename.strip_prefix("ephemeral-") {
-        rest.parse::<u32>().is_ok()
-    } else {
-        false
+    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
+        if flushed_blknums.contains(&(blknum as u64)) {
+            let cache = page_cache::get();
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum)
+                    .await
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum,
+                                self.file.path.display(),
+                                e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
+                        write_guard.mark_valid();
+
+                        // Swap for read lock
+                        continue;
+                    }
+                };
+            }
+        } else {
+            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
+            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
+        }
    }
-}

-impl BlobWriter for EphemeralFile {
-    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+    pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
        struct Writer<'a> {
            ephemeral_file: &'a mut EphemeralFile,
            /// The block to which the next [`push_bytes`] will write.
@@ -84,13 +113,13 @@ impl BlobWriter for EphemeralFile {
        impl<'a> Writer<'a> {
            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
                Ok(Writer {
-                    blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
-                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
+                    blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
+                    off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
                    ephemeral_file,
                })
            }
            #[inline(always)]
-            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+            async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -107,10 +136,13 @@ impl BlobWriter for EphemeralFile {
                                // Pre-warm the page cache with what we just wrote.
                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
                                let cache = page_cache::get();
-                                match cache.read_immutable_buf(
-                                    self.ephemeral_file.page_cache_file_id,
-                                    self.blknum,
-                                ) {
+                                match cache
+                                    .read_immutable_buf(
+                                        self.ephemeral_file.page_cache_file_id,
+                                        self.blknum,
+                                    )
+                                    .await
+                                {
                                    Ok(page_cache::ReadBufResult::Found(_guard)) => {
                                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
                                        unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
@@ -154,39 +186,47 @@ impl BlobWriter for EphemeralFile {
            }
        }

-        let pos = self.size;
+        let pos = self.len;
        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
-            writer.push_bytes(&len_buf)?;
+            writer.push_bytes(&len_buf).await?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            writer.push_bytes(&len_buf)?;
+            writer.push_bytes(&len_buf).await?;
        }

        // Write the payload
-        writer.push_bytes(srcbuf)?;
+        writer.push_bytes(srcbuf).await?;

        if srcbuf.len() < 0x80 {
-            self.size += 1;
+            self.len += 1;
        } else {
-            self.size += 4;
+            self.len += 4;
        }
-        self.size += srcbuf.len() as u64;
+        self.len += srcbuf.len() as u64;

        Ok(pos)
    }
 }

+/// Does the given filename look like an ephemeral file?
+pub fn is_ephemeral_file(filename: &str) -> bool {
+    if let Some(rest) = filename.strip_prefix("ephemeral-") {
+        rest.parse::<u32>().is_ok()
+    } else {
+        false
+    }
+}
+
 impl Drop for EphemeralFile {
    fn drop(&mut self) {
-        // drop all pages from page cache
-        let cache = page_cache::get();
-        cache.drop_buffers_for_immutable(self.page_cache_file_id);
+        // There might still be pages in the [`crate::page_cache`] for this file.
+        // We leave them there; [`crate::page_cache::PageCache::find_victim`] will evict them when needed.

        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
@@ -207,52 +247,15 @@ impl Drop for EphemeralFile {
 }

 impl BlockReader for EphemeralFile {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
-        let flushed_blknums = 0..self.size / PAGE_SZ as u64;
-        if flushed_blknums.contains(&(blknum as u64)) {
-            let cache = page_cache::get();
-            loop {
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum)
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.file.path.display(),
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        let buf: &mut [u8] = write_guard.deref_mut();
-                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                        self.file
-                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
-                        write_guard.mark_valid();
-
-                        // Swap for read lock
-                        continue;
-                    }
-                };
-            }
-        } else {
-            debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
-            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
-        }
+    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
+        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
    }
 }

 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::blob_io::BlobWriter;
-    use crate::tenant::block_io::BlockCursor;
+    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;
@@ -280,12 +283,12 @@ mod tests {

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

-        let pos_foo = file.write_blob(b"foo")?;
+        let pos_foo = file.write_blob(b"foo").await?;
        assert_eq!(
            b"foo",
            file.block_cursor().read_blob(pos_foo).await?.as_slice()
        );
-        let pos_bar = file.write_blob(b"bar")?;
+        let pos_bar = file.write_blob(b"bar").await?;
        assert_eq!(
            b"foo",
            file.block_cursor().read_blob(pos_foo).await?.as_slice()
@@ -298,17 +301,17 @@ mod tests {
        let mut blobs = Vec::new();
        for i in 0..10000 {
            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data)?;
+            let pos = file.write_blob(&data).await?;
            blobs.push((pos, data));
        }
        // also test with a large blobs
        for i in 0..100 {
            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data)?;
+            let pos = file.write_blob(&data).await?;
            blobs.push((pos, data));
        }

-        let cursor = BlockCursor::new(&file);
+        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
        for (pos, expected) in blobs {
            let actual = cursor.read_blob(pos).await?;
            assert_eq!(actual, expected);
@@ -318,7 +321,7 @@ mod tests {
        let mut large_data = Vec::new();
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data)?;
+        let pos_large = file.write_blob(&large_data).await?;
        let result = file.block_cursor().read_blob(pos_large).await?;
        assert_eq!(result, large_data);

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
 use std::io::{self, Write};

 use anyhow::{bail, ensure, Context};
-use serde::{Deserialize, Serialize};
+use serde::{de::Error, Deserialize, Serialize, Serializer};
 use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
@@ -232,6 +232,28 @@ impl TimelineMetadata {
    }
 }

+impl<'de> Deserialize<'de> for TimelineMetadata {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let bytes = Vec::<u8>::deserialize(deserializer)?;
+        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
+    }
+}
+
+impl Serialize for TimelineMetadata {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let bytes = self
+            .to_bytes()
+            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
+        bytes.serialize(serializer)
+    }
+}
+
 /// Save timeline metadata to file
 pub fn save_metadata(
    conf: &'static PageServerConf,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -25,6 +25,7 @@ use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantSt
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
+use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
@@ -202,6 +203,7 @@ pub(crate) fn schedule_local_tenant_processing(
            match Tenant::spawn_attach(
                conf,
                tenant_id,
+                Generation::none(),
                resources.broker_client,
                tenants,
                remote_storage,
@@ -224,7 +226,15 @@ pub(crate) fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(conf, tenant_id, resources, init_order, tenants, ctx)
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            Generation::none(),
+            resources,
+            init_order,
+            tenants,
+            ctx,
+        )
    };
    Ok(tenant)
 }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -135,7 +135,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
+//!   [`Tenant::timeline_init_and_sync`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -172,7 +172,6 @@
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
 //! We keep track of the fact that a client is in `Attaching` state in a marker
 //! file on the local disk. This is critical because, when we restart the pageserver,
 //! we do not want to do the `List timelines` step for each tenant that has already
@@ -192,14 +191,14 @@
 //! not created and the uploads are skipped.
 //! Theoretically, it should be ok to remove and re-add remote storage configuration to
 //! the pageserver config at any time, since it doesn't make a difference to
-//! `reconcile_with_remote`.
+//! [`Timeline::load_layer_map`].
 //! Of course, the remote timeline dir must not change while we have de-configured
 //! remote storage, i.e., the pageserver must remain the owner of the given prefix
 //! in remote storage.
 //! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
-//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
+//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

 mod delete;
 mod download;
@@ -211,12 +210,13 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
+use tokio_util::sync::CancellationToken;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };

 use std::collections::{HashMap, VecDeque};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

@@ -231,9 +231,11 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
+use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::upload_queue::Delete;
+use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
    config::PageServerConf,
    task_mgr,
@@ -251,6 +253,7 @@ use self::index::IndexPart;

 use super::storage_layer::LayerFileName;
 use super::upload_queue::SetDeletedFlagProgress;
+use super::Generation;

 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -314,6 +317,7 @@ pub struct RemoteTimelineClient {

    tenant_id: TenantId,
    timeline_id: TimelineId,
+    generation: Generation,

    upload_queue: Mutex<UploadQueue>,

@@ -334,12 +338,14 @@ impl RemoteTimelineClient {
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        timeline_id: TimelineId,
+        generation: Generation,
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
            runtime: BACKGROUND_RUNTIME.handle().to_owned(),
            tenant_id,
            timeline_id,
+            generation,
            storage_impl: remote_storage,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
@@ -353,6 +359,10 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part)?;
        self.update_remote_physical_size_gauge(Some(index_part));
+        info!(
+            "initialized upload queue from remote index with {} layer files",
+            index_part.layer_metadata.len()
+        );
        Ok(())
    }

@@ -365,6 +375,7 @@ impl RemoteTimelineClient {
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_empty_remote(local_metadata)?;
        self.update_remote_physical_size_gauge(None);
+        info!("initialized upload queue as empty");
        Ok(())
    }

@@ -443,10 +454,10 @@ impl RemoteTimelineClient {
        );

        let index_part = download::download_index_part(
-            self.conf,
            &self.storage_impl,
            &self.tenant_id,
            &self.timeline_id,
+            self.generation,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -535,8 +546,7 @@ impl RemoteTimelineClient {
        // ahead of what's _actually_ on the remote during index upload.
        upload_queue.latest_metadata = metadata.clone();

-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-        self.schedule_index_upload(upload_queue, metadata_bytes);
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

        Ok(())
    }
@@ -556,8 +566,7 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-            self.schedule_index_upload(upload_queue, metadata_bytes);
+            self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
        }

        Ok(())
@@ -567,7 +576,7 @@ impl RemoteTimelineClient {
    fn schedule_index_upload(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        metadata_bytes: Vec<u8>,
+        metadata: TimelineMetadata,
    ) {
        info!(
            "scheduling metadata upload with {} files ({} changed)",
@@ -580,7 +589,7 @@ impl RemoteTimelineClient {
        let index_part = IndexPart::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
        self.calls_unfinished_metric_begin(&op);
@@ -636,7 +645,7 @@ impl RemoteTimelineClient {

        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        let metadata = upload_queue.latest_metadata.clone();

        // Update the remote index file, removing the to-be-deleted files from the index,
        // before deleting the actual files.
@@ -646,21 +655,41 @@ impl RemoteTimelineClient {
        // from latest_files, but not yet scheduled for deletion. Use a closure
        // to syntactically forbid ? or bail! calls here.
        let no_bail_here = || {
-            for name in names {
-                upload_queue.latest_files.remove(name);
-                upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-            }
+            // Decorate our list of names with each name's generation, dropping
+            // names that are unexpectedly missing from our metadata.
+            let with_generations: Vec<_> = names
+                .iter()
+                .filter_map(|name| {
+                    // Remove from latest_files, learning the file's remote generation in the process
+                    let meta = upload_queue.latest_files.remove(name);
+
+                    if let Some(meta) = meta {
+                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                        Some((name, meta.generation))
+                    } else {
+                        // This can only happen if we forgot to schedule the file upload
+                        // before scheduling the delete. Log it because it is a rare/strange
+                        // situation, and in case something is misbehaving, we'd like to know which
+                        // layers experienced this.
+                        info!(
+                            "Deleting layer {name} not found in latest_files list, never uploaded?"
+                        );
+                        None
+                    }
+                })
+                .collect();

            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata_bytes);
+                self.schedule_index_upload(upload_queue, metadata);
            }

            // schedule the actual deletions
-            for name in names {
+            for (name, generation) in with_generations {
                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: false,
+                    generation,
                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
@@ -754,15 +783,14 @@ impl RemoteTimelineClient {
        pausable_failpoint!("persist_deleted_index_part");

        backoff::retry(
-            || async {
+            || {
                upload::upload_index_part(
-                    self.conf,
                    &self.storage_impl,
                    &self.tenant_id,
                    &self.timeline_id,
+                    self.generation,
                    &index_part_with_deleted_at,
                )
-                .await
            },
            |_e| false,
            1,
@@ -771,6 +799,8 @@ impl RemoteTimelineClient {
            // when executed as part of tenant deletion this happens in the background
            2,
            "persist_index_part_with_deleted_flag",
+            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await?;

@@ -816,12 +846,14 @@ impl RemoteTimelineClient {
                .reserve(stopped.upload_queue_for_deletion.latest_files.len());

            // schedule the actual deletions
-            for name in stopped.upload_queue_for_deletion.latest_files.keys() {
+            for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: true,
+                    generation: meta.generation,
                });
+
                self.calls_unfinished_metric_begin(&op);
                stopped
                    .upload_queue_for_deletion
@@ -844,8 +876,7 @@ impl RemoteTimelineClient {

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
-        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
-        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
+        let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);

        let remaining = backoff::retry(
            || async {
@@ -857,6 +888,7 @@ impl RemoteTimelineClient {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "list_prefixes",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
        )
        .await
        .context("list prefixes")?;
@@ -880,6 +912,7 @@ impl RemoteTimelineClient {
                FAILED_UPLOAD_WARN_THRESHOLD,
                FAILED_REMOTE_OP_RETRIES,
                "delete_objects",
+                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
            )
            .await
            .context("delete_objects")?;
@@ -901,6 +934,7 @@ impl RemoteTimelineClient {
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "delete_index",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
        )
        .await
        .context("delete_index")?;
@@ -1046,15 +1080,17 @@ impl RemoteTimelineClient {

            let upload_result: anyhow::Result<()> = match &task.op {
                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
-                    let path = &self
+                    let path = self
                        .conf
                        .timeline_path(&self.tenant_id, &self.timeline_id)
                        .join(layer_file_name.file_name());
+
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        path,
+                        &path,
                        layer_metadata,
+                        self.generation,
                    )
                    .measure_remote_op(
                        self.tenant_id,
@@ -1066,11 +1102,20 @@ impl RemoteTimelineClient {
                    .await
                }
                UploadOp::UploadMetadata(ref index_part, _lsn) => {
+                    let mention_having_future_layers = if cfg!(feature = "testing") {
+                        index_part
+                            .layer_metadata
+                            .keys()
+                            .any(|x| x.is_in_future(*_lsn))
+                    } else {
+                        false
+                    };
+
                    let res = upload::upload_index_part(
-                        self.conf,
                        &self.storage_impl,
                        &self.tenant_id,
                        &self.timeline_id,
+                        self.generation,
                        index_part,
                    )
                    .measure_remote_op(
@@ -1083,6 +1128,10 @@ impl RemoteTimelineClient {
                    .await;
                    if res.is_ok() {
                        self.update_remote_physical_size_gauge(Some(index_part));
+                        if mention_having_future_layers {
+                            // find rationale near crate::tenant::timeline::init::cleanup_future_layer
+                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
+                        }
                    }
                    res
                }
@@ -1091,7 +1140,7 @@ impl RemoteTimelineClient {
                        .conf
                        .timeline_path(&self.tenant_id, &self.timeline_id)
                        .join(delete.layer_file_name.file_name());
-                    delete::delete_layer(self.conf, &self.storage_impl, path)
+                    delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
                        .measure_remote_op(
                            self.tenant_id,
                            self.timeline_id,
@@ -1134,14 +1183,13 @@ impl RemoteTimelineClient {
                    }

                    // sleep until it's time to retry, or we're cancelled
-                    tokio::select! {
-                        _ = task_mgr::shutdown_watcher() => { },
-                        _ = exponential_backoff(
-                            retries,
-                            DEFAULT_BASE_BACKOFF_SECONDS,
-                            DEFAULT_MAX_BACKOFF_SECONDS,
-                        ) => { },
-                    };
+                    exponential_backoff(
+                        retries,
+                        DEFAULT_BASE_BACKOFF_SECONDS,
+                        DEFAULT_MAX_BACKOFF_SECONDS,
+                        &shutdown_token(),
+                    )
+                    .await;
                }
            }
        }
@@ -1339,6 +1387,71 @@ impl RemoteTimelineClient {
    }
 }

+pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
+    let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
+    RemotePath::from_string(&path).expect("Failed to construct path")
+}
+
+pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
+    remote_timelines_path(tenant_id).join(&PathBuf::from(timeline_id.to_string()))
+}
+
+pub fn remote_layer_path(
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+    layer_file_name: &LayerFileName,
+    layer_meta: &LayerFileMetadata,
+) -> RemotePath {
+    // Generation-aware key format
+    let path = format!(
+        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
+        layer_file_name.file_name(),
+        layer_meta.generation.get_suffix()
+    );
+
+    RemotePath::from_string(&path).expect("Failed to construct path")
+}
+
+pub fn remote_index_path(
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+    generation: Generation,
+) -> RemotePath {
+    RemotePath::from_string(&format!(
+        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
+        IndexPart::FILE_NAME,
+        generation.get_suffix()
+    ))
+    .expect("Failed to construct path")
+}
+
+/// Files on the remote storage are stored under paths relative to the workdir, with the generation suffix appended.
+/// That path itself includes both the tenant and timeline ids, which makes the remote storage path unique.
+///
+/// Errors if the provided path does not start with the pageserver's workdir.
+pub fn remote_path(
+    conf: &PageServerConf,
+    local_path: &Path,
+    generation: Generation,
+) -> anyhow::Result<RemotePath> {
+    let stripped = local_path
+        .strip_prefix(&conf.workdir)
+        .context("Failed to strip workdir prefix")?;
+
+    let suffixed = format!(
+        "{0}{1}",
+        stripped.to_string_lossy(),
+        generation.get_suffix()
+    );
+
+    RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
+        format!(
+            "to resolve remote part of path {:?} for base {:?}",
+            local_path, conf.workdir
+        )
+    })
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -1346,7 +1459,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            Tenant, Timeline,
+            Generation, Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
    };
@@ -1388,8 +1501,11 @@ mod tests {
        assert_eq!(avec, bvec);
    }

-    fn assert_remote_files(expected: &[&str], remote_path: &Path) {
-        let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
+    fn assert_remote_files(expected: &[&str], remote_path: &Path, generation: Generation) {
+        let mut expected: Vec<String> = expected
+            .iter()
+            .map(|x| format!("{}{}", x, generation.get_suffix()))
+            .collect();
        expected.sort();

        let mut found: Vec<String> = Vec::new();
@@ -1440,6 +1556,8 @@ mod tests {
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };

+            let generation = Generation::new(0xdeadbeef);
+
            let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

            let client = Arc::new(RemoteTimelineClient {
@@ -1447,6 +1565,7 @@ mod tests {
                runtime: tokio::runtime::Handle::current(),
                tenant_id: harness.tenant_id,
                timeline_id: TIMELINE_ID,
+                generation,
                storage_impl: storage,
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
                metrics: Arc::new(RemoteTimelineClientMetrics::new(
@@ -1505,6 +1624,8 @@ mod tests {
            .init_upload_queue_for_empty_remote(&metadata)
            .unwrap();

+        let generation = Generation::new(0xdeadbeef);
+
        // Create a couple of dummy files,  schedule upload for them
        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
@@ -1524,13 +1645,13 @@ mod tests {
        client
            .schedule_layer_file_upload(
                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64),
+                &LayerFileMetadata::new(content_1.len() as u64, generation),
            )
            .unwrap();
        client
            .schedule_layer_file_upload(
                &layer_file_name_2,
-                &LayerFileMetadata::new(content_2.len() as u64),
+                &LayerFileMetadata::new(content_2.len() as u64, generation),
            )
            .unwrap();

@@ -1588,14 +1709,13 @@ mod tests {
                &layer_file_name_2.file_name(),
            ],
        );
-        let downloaded_metadata = index_part.parse_metadata().unwrap();
-        assert_eq!(downloaded_metadata, metadata);
+        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
            .schedule_layer_file_upload(
                &layer_file_name_3,
-                &LayerFileMetadata::new(content_3.len() as u64),
+                &LayerFileMetadata::new(content_3.len() as u64, generation),
            )
            .unwrap();
        client
@@ -1619,6 +1739,7 @@ mod tests {
                "index_part.json",
            ],
            &remote_timeline_dir,
+            generation,
        );

        // Finish them
@@ -1631,6 +1752,7 @@ mod tests {
                "index_part.json",
            ],
            &remote_timeline_dir,
+            generation,
        );
    }

@@ -1683,12 +1805,14 @@ mod tests {

        // Test

+        let generation = Generation::new(0xdeadbeef);
+
        let init = get_bytes_started_stopped();

        client
            .schedule_layer_file_upload(
                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64),
+                &LayerFileMetadata::new(content_1.len() as u64, generation),
            )
            .unwrap();

--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ b/pageserver/src/tenant/remote_timeline_client/delete.rs
@@ -5,25 +5,30 @@ use tracing::debug;

 use remote_storage::GenericRemoteStorage;

-use crate::config::PageServerConf;
+use crate::{
+    config::PageServerConf,
+    tenant::{remote_timeline_client::remote_path, Generation},
+};

 pub(super) async fn delete_layer<'a>(
    conf: &'static PageServerConf,
    storage: &'a GenericRemoteStorage,
    local_layer_path: &'a Path,
+    generation: Generation,
 ) -> anyhow::Result<()> {
    fail::fail_point!("before-delete-layer", |_| {
        anyhow::bail!("failpoint before-delete-layer")
    });
    debug!("Deleting layer from remote storage: {local_layer_path:?}",);

-    let path_to_delete = conf.remote_path(local_layer_path)?;
+    let path_to_delete = remote_path(conf, local_layer_path, generation)?;

    // We don't want to print an error if the delete failed if the file has
    // already been deleted. Thankfully, in this situation S3 already
    // does not yield an error. While OS-provided local file system APIs do yield
    // errors, we avoid them in the `LocalFs` wrapper.
-    storage.delete(&path_to_delete).await.with_context(|| {
-        format!("Failed to delete remote layer from storage at {path_to_delete:?}")
-    })
+    storage
+        .delete(&path_to_delete)
+        .await
+        .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,17 +11,20 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
+use tokio_util::sync::CancellationToken;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
+use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::Generation;
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
+use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);

@@ -40,13 +43,11 @@ pub async fn download_layer_file<'a>(
 ) -> Result<u64, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

-    let timeline_path = conf.timeline_path(&tenant_id, &timeline_id);
+    let local_path = conf
+        .timeline_path(&tenant_id, &timeline_id)
+        .join(layer_file_name.file_name());

-    let local_path = timeline_path.join(layer_file_name.file_name());
-
-    let remote_path = conf
-        .remote_path(&local_path)
-        .map_err(DownloadError::Other)?;
+    let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);

    // Perform a rename inspired by durable_rename from file_utils.c.
    // The sequence:
@@ -63,33 +64,43 @@ pub async fn download_layer_file<'a>(
    let (mut destination_file, bytes_amount) = download_retry(
        || async {
            // TODO: this doesn't use the cached fd for some reason?
-            let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
-                format!(
-                    "create a destination file for layer '{}'",
-                    temp_file_path.display()
-                )
-            })
-            .map_err(DownloadError::Other)?;
-            let mut download = storage.download(&remote_path).await.with_context(|| {
-                format!(
+            let mut destination_file = fs::File::create(&temp_file_path)
+                .await
+                .with_context(|| {
+                    format!(
+                        "create a destination file for layer '{}'",
+                        temp_file_path.display()
+                    )
+                })
+                .map_err(DownloadError::Other)?;
+            let mut download = storage
+                .download(&remote_path)
+                .await
+                .with_context(|| {
+                    format!(
                    "open a download stream for layer with remote storage path '{remote_path:?}'"
                )
-            })
-            .map_err(DownloadError::Other)?;
-
-            let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
-                .await
-                .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
-                .with_context(|| {
-                    format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
                })
                .map_err(DownloadError::Other)?;

-            Ok((destination_file, bytes_amount))
+            let bytes_amount = tokio::time::timeout(
+                MAX_DOWNLOAD_DURATION,
+                tokio::io::copy(&mut download.download_stream, &mut destination_file),
+            )
+            .await
+            .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
+            .with_context(|| {
+                format!(
+                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
+                )
+            })
+            .map_err(DownloadError::Other)?;

+            Ok((destination_file, bytes_amount))
        },
        &format!("download {remote_path:?}"),
-    ).await?;
+    )
+    .await?;

    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
    // A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -102,12 +113,7 @@ pub async fn download_layer_file<'a>(
    destination_file
        .flush()
        .await
-        .with_context(|| {
-            format!(
-                "failed to flush source file at {}",
-                temp_file_path.display()
-            )
-        })
+        .with_context(|| format!("flush source file at {}", temp_file_path.display()))
        .map_err(DownloadError::Other)?;

    let expected = layer_metadata.file_size();
@@ -138,17 +144,12 @@ pub async fn download_layer_file<'a>(

    fs::rename(&temp_file_path, &local_path)
        .await
-        .with_context(|| {
-            format!(
-                "Could not rename download layer file to {}",
-                local_path.display(),
-            )
-        })
+        .with_context(|| format!("rename download layer file to {}", local_path.display(),))
        .map_err(DownloadError::Other)?;

    crashsafe::fsync_async(&local_path)
        .await
-        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
+        .with_context(|| format!("fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;

    tracing::debug!("download complete: {}", local_path.display());
@@ -172,21 +173,19 @@ pub fn is_temp_download_file(path: &Path) -> bool {
 }

 /// List timelines of given tenant in remote storage
-pub async fn list_remote_timelines<'a>(
-    storage: &'a GenericRemoteStorage,
-    conf: &'static PageServerConf,
+pub async fn list_remote_timelines(
+    storage: &GenericRemoteStorage,
    tenant_id: TenantId,
 ) -> anyhow::Result<HashSet<TimelineId>> {
-    let tenant_path = conf.timelines_path(&tenant_id);
-    let tenant_storage_path = conf.remote_path(&tenant_path)?;
+    let remote_path = remote_timelines_path(&tenant_id);

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

    let timelines = download_retry(
-        || storage.list_prefixes(Some(&tenant_storage_path)),
-        &format!("list prefixes for {tenant_path:?}"),
+        || storage.list_prefixes(Some(&remote_path)),
+        &format!("list prefixes for {tenant_id}"),
    )
    .await?;

@@ -201,9 +200,9 @@ pub async fn list_remote_timelines<'a>(
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;

-        let timeline_id: TimelineId = object_name.parse().with_context(|| {
-            format!("failed to parse object name into timeline id '{object_name}'")
-        })?;
+        let timeline_id: TimelineId = object_name
+            .parse()
+            .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;

        // list_prefixes is assumed to return unique names. Ensure this here.
        // NB: it's safer to bail out than warn-log this because the pageserver
@@ -221,21 +220,16 @@ pub async fn list_remote_timelines<'a>(
 }

 pub(super) async fn download_index_part(
-    conf: &'static PageServerConf,
    storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
+    generation: Generation,
 ) -> Result<IndexPart, DownloadError> {
-    let index_part_path = conf
-        .metadata_path(tenant_id, timeline_id)
-        .with_file_name(IndexPart::FILE_NAME);
-    let part_storage_path = conf
-        .remote_path(&index_part_path)
-        .map_err(DownloadError::BadInput)?;
+    let remote_path = remote_index_path(tenant_id, timeline_id, generation);

    let index_part_bytes = download_retry(
        || async {
-            let mut index_part_download = storage.download(&part_storage_path).await?;
+            let mut index_part_download = storage.download(&remote_path).await?;

            let mut index_part_bytes = Vec::new();
            tokio::io::copy(
@@ -243,20 +237,16 @@ pub(super) async fn download_index_part(
                &mut index_part_bytes,
            )
            .await
-            .with_context(|| {
-                format!("Failed to download an index part into file {index_part_path:?}")
-            })
+            .with_context(|| format!("download index part at {remote_path:?}"))
            .map_err(DownloadError::Other)?;
            Ok(index_part_bytes)
        },
-        &format!("download {part_storage_path:?}"),
+        &format!("download {remote_path:?}"),
    )
    .await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
-        .with_context(|| {
-            format!("Failed to deserialize index part file into file {index_part_path:?}")
-        })
+        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
        .map_err(DownloadError::Other)?;

    Ok(index_part)
@@ -280,6 +270,10 @@ where
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
+            unreachable!()
+        }),
    )
    .await
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -2,7 +2,7 @@
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.

-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;

 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
@@ -12,6 +12,7 @@ use utils::bin_ser::SerializeError;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
+use crate::tenant::Generation;

 use utils::lsn::Lsn;

@@ -20,22 +21,28 @@ use utils::lsn::Lsn;
 /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
 /// might have less or more metadata depending if upgrading or rolling back an upgrade.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
-#[cfg_attr(test, derive(Default))]
+//#[cfg_attr(test, derive(Default))]
 pub struct LayerFileMetadata {
    file_size: u64,
+
+    pub(crate) generation: Generation,
 }

 impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
    fn from(other: &IndexLayerMetadata) -> Self {
        LayerFileMetadata {
            file_size: other.file_size,
+            generation: other.generation,
        }
    }
 }

 impl LayerFileMetadata {
-    pub fn new(file_size: u64) -> Self {
-        LayerFileMetadata { file_size }
+    pub fn new(file_size: u64, generation: Generation) -> Self {
+        LayerFileMetadata {
+            file_size,
+            generation,
+        }
    }

    pub fn file_size(&self) -> u64 {
@@ -62,10 +69,6 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

-    /// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
-    #[serde(default, skip_deserializing)]
-    timeline_layers: HashSet<LayerFileName>,
-
    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -77,7 +80,9 @@ pub struct IndexPart {
    // private because internally we would read from metadata instead.
    #[serde_as(as = "DisplayFromStr")]
    disk_consistent_lsn: Lsn,
-    metadata_bytes: Vec<u8>,
+
+    #[serde(rename = "metadata_bytes")]
+    pub metadata: TimelineMetadata,
 }

 impl IndexPart {
@@ -89,36 +94,29 @@ impl IndexPart {
    /// - 2: added `deleted_at`
    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
    ///      is always generated from the keys of `layer_metadata`)
-    const LATEST_VERSION: usize = 3;
+    /// - 4: timeline_layers is fully removed.
+    const LATEST_VERSION: usize = 4;
    pub const FILE_NAME: &'static str = "index_part.json";

    pub fn new(
        layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
        disk_consistent_lsn: Lsn,
-        metadata_bytes: Vec<u8>,
+        metadata: TimelineMetadata,
    ) -> Self {
-        let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
-        let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
-
-        for (remote_name, metadata) in &layers_and_metadata {
-            timeline_layers.insert(remote_name.to_owned());
-            let metadata = IndexLayerMetadata::from(metadata);
-            layer_metadata.insert(remote_name.to_owned(), metadata);
-        }
+        // Transform LayerFileMetadata into IndexLayerMetadata
+        let layer_metadata = layers_and_metadata
+            .into_iter()
+            .map(|(k, v)| (k, IndexLayerMetadata::from(v)))
+            .collect();

        Self {
            version: Self::LATEST_VERSION,
-            timeline_layers,
            layer_metadata,
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
            deleted_at: None,
        }
    }
-
-    pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
-        TimelineMetadata::from_bytes(&self.metadata_bytes)
-    }
 }

 impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -126,26 +124,31 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {

    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        let metadata = upload_queue.latest_metadata.clone();

        Ok(Self::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata_bytes,
+            metadata,
        ))
    }
 }

 /// Serialized form of [`LayerFileMetadata`].
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct IndexLayerMetadata {
    pub(super) file_size: u64,
+
+    #[serde(default = "Generation::none")]
+    #[serde(skip_serializing_if = "Generation::is_none")]
+    pub(super) generation: Generation,
 }

-impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
-    fn from(other: &'_ LayerFileMetadata) -> Self {
+impl From<LayerFileMetadata> for IndexLayerMetadata {
+    fn from(other: LayerFileMetadata) -> Self {
        IndexLayerMetadata {
            file_size: other.file_size,
+            generation: other.generation,
        }
    }
 }
@@ -170,19 +173,20 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
-            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
+                    generation: Generation::none()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
+                    generation: Generation::none()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
        };

@@ -201,25 +205,26 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        }"#;

        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
-            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
+                    generation: Generation::none()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
+                    generation: Generation::none()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
        };

@@ -238,26 +243,27 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            "deleted_at": "2023-07-31T09:00:00.123"
        }"#;

        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
-            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
+                    generation: Generation::none()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
+                    generation: Generation::none()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
        };
@@ -278,10 +284,9 @@ mod tests {

        let expected = IndexPart {
            version: 1,
-            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::new(),
            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
-            metadata_bytes: [
+            metadata: TimelineMetadata::from_bytes(&[
                136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
                38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
                210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
@@ -302,8 +307,8 @@ mod tests {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0,
-            ]
-            .to_vec(),
+            ])
+            .unwrap(),
            deleted_at: None,
        };

@@ -311,4 +316,41 @@ mod tests {

        assert_eq!(empty_layers_parsed, expected);
    }
+
+    #[test]
+    fn v4_indexpart_is_parsed() {
+        let example = r#"{
+            "version":4,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "deleted_at": "2023-07-31T09:00:00.123"
+        }"#;
+
+        let expected = IndexPart {
+            version: 4,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                    // serde_json should always parse this exactly, but tools that read JSON numbers
+                    // as doubles (jq, for example) might not: the value is larger than 2^53.
+                    file_size: 9007199254741001,
+                    generation: Generation::none()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+        };
+
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        assert_eq!(part, expected);
+    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -5,7 +5,11 @@ use fail::fail_point;
 use std::{io::ErrorKind, path::Path};
 use tokio::fs;

-use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
+use super::Generation;
+use crate::{
+    config::PageServerConf,
+    tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
+};
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};

@@ -15,10 +19,10 @@ use tracing::info;

 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
-    conf: &'static PageServerConf,
    storage: &'a GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
+    generation: Generation,
    index_part: &'a IndexPart,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading new index part");
@@ -27,20 +31,16 @@ pub(super) async fn upload_index_part<'a>(
        bail!("failpoint before-upload-index")
    });

-    let index_part_bytes = serde_json::to_vec(&index_part)
-        .context("Failed to serialize index part file into bytes")?;
+    let index_part_bytes =
+        serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));

-    let index_part_path = conf
-        .metadata_path(tenant_id, timeline_id)
-        .with_file_name(IndexPart::FILE_NAME);
-    let storage_path = conf.remote_path(&index_part_path)?;
-
+    let remote_path = remote_index_path(tenant_id, timeline_id, generation);
    storage
-        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
+        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
        .await
-        .with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
+        .with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
 }

 /// Attempts to upload given layer files.
@@ -52,12 +52,13 @@ pub(super) async fn upload_timeline_layer<'a>(
    storage: &'a GenericRemoteStorage,
    source_path: &'a Path,
    known_metadata: &'a LayerFileMetadata,
+    generation: Generation,
 ) -> anyhow::Result<()> {
    fail_point!("before-upload-layer", |_| {
        bail!("failpoint before-upload-layer")
    });
-    let storage_path = conf.remote_path(source_path)?;

+    let storage_path = remote_path(conf, source_path, generation)?;
    let source_file_res = fs::File::open(&source_path).await;
    let source_file = match source_file_res {
        Ok(source_file) => source_file,
@@ -70,16 +71,15 @@ pub(super) async fn upload_timeline_layer<'a>(
            info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
-        Err(e) => Err(e)
-            .with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
+        Err(e) => {
+            Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
+        }
    };

    let fs_size = source_file
        .metadata()
        .await
-        .with_context(|| {
-            format!("Failed to get the source file metadata for layer {source_path:?}")
-        })?
+        .with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
        .len();

    let metadata_size = known_metadata.file_size();
@@ -87,19 +87,13 @@ pub(super) async fn upload_timeline_layer<'a>(
        bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
    }

-    let fs_size = usize::try_from(fs_size).with_context(|| {
-        format!("File {source_path:?} size {fs_size} could not be converted to usize")
-    })?;
+    let fs_size = usize::try_from(fs_size)
+        .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;

    storage
        .upload(source_file, fs_size, &storage_path, None)
        .await
-        .with_context(|| {
-            format!(
-                "Failed to upload a layer from local path '{}'",
-                source_path.display()
-            )
-        })?;
+        .with_context(|| format!("upload layer from local path '{}'", source_path.display()))?;

    Ok(())
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -41,8 +41,6 @@ pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use remote_layer::RemoteLayer;

-use super::timeline::layer_manager::LayerManager;
-
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
    T: PartialOrd<T>,
@@ -175,16 +173,9 @@ impl LayerAccessStats {
    ///
    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn for_loading_layer(
-        layer_map_lock_held_witness: &LayerManager,
-        status: LayerResidenceStatus,
-    ) -> Self {
+    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
-        new.record_residence_event(
-            layer_map_lock_held_witness,
-            status,
-            LayerResidenceEventReason::LayerLoad,
-        );
+        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
        new
    }

@@ -197,7 +188,6 @@ impl LayerAccessStats {
    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn clone_for_residence_change(
        &self,
-        layer_map_lock_held_witness: &LayerManager,
        new_status: LayerResidenceStatus,
    ) -> LayerAccessStats {
        let clone = {
@@ -205,11 +195,7 @@ impl LayerAccessStats {
            inner.clone()
        };
        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(
-            layer_map_lock_held_witness,
-            new_status,
-            LayerResidenceEventReason::ResidenceChange,
-        );
+        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
        new
    }

@@ -229,7 +215,6 @@ impl LayerAccessStats {
    ///
    pub(crate) fn record_residence_event(
        &self,
-        _layer_map_lock_held_witness: &LayerManager,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
    ) {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -318,30 +318,28 @@ impl DeltaLayer {

        tree_reader.dump().await?;

-        let keys = DeltaLayerInner::load_keys(&Ref(&**inner)).await?;
+        let keys = DeltaLayerInner::load_keys(&inner).await?;

        // A subroutine to dump a single blob
-        let dump_blob = |val: ValueRef<_>| -> _ {
-            async move {
-                let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
-                let val = Value::des(&buf)?;
-                let desc = match val {
-                    Value::Image(img) => {
-                        format!(" img {} bytes", img.len())
-                    }
-                    Value::WalRecord(rec) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec)?;
-                        format!(
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )
-                    }
-                };
-                Ok(desc)
-            }
-        };
+        // Reads one blob via `val`, decodes it as a `Value` and returns a one-line description.
+        async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
+            // Fetch the raw bytes at the position recorded in the blob reference.
+            let raw = val.reader.read_blob(val.blob_ref.pos()).await?;
+            let description = match Value::des(&raw)? {
+                Value::Image(image) => format!(" img {} bytes", image.len()),
+                Value::WalRecord(record) => {
+                    let wal_desc = walrecord::describe_wal_record(&record)?;
+                    // The byte count printed here is the length of the blob that was read.
+                    let blob_len = raw.len();
+                    let will_init = record.will_init();
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        blob_len, will_init, wal_desc
+                    )
+                }
+            };
+            Ok(description)
+        }

        for entry in keys {
            let DeltaEntry { key, lsn, val, .. } = entry;
@@ -469,7 +467,7 @@ impl DeltaLayer {
            PathOrConf::Path(_) => None,
        };

-        let loaded = DeltaLayerInner::load(&path, summary)?;
+        let loaded = DeltaLayerInner::load(&path, summary).await?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -552,17 +550,12 @@ impl DeltaLayer {
    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
    ///
    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(
-        &self,
-        ctx: &RequestContext,
-    ) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
+    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
        let inner = self
            .load(LayerAccessKind::KeyIter, ctx)
            .await
            .context("load delta layer keys")?;
-
-        let inner = Ref(&**inner);
-        DeltaLayerInner::load_keys(&inner)
+        DeltaLayerInner::load_keys(inner)
            .await
            .context("Layer index is corrupted")
    }
@@ -848,12 +841,15 @@ impl Drop for DeltaLayerWriter {
 }

 impl DeltaLayerInner {
-    pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
+    pub(super) async fn load(
+        path: &std::path::Path,
+        summary: Option<Summary>,
+    ) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0)?;
+        let summary_blk = file.read_blk(0).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -958,14 +954,14 @@ impl DeltaLayerInner {

    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
        this: &T,
-    ) -> Result<Vec<DeltaEntry<T>>> {
+    ) -> Result<Vec<DeltaEntry<'_>>> {
        let dl = this.as_ref();
        let file = &dl.file;

        let tree_reader =
            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);

-        let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
+        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

        tree_reader
            .visit(
@@ -975,7 +971,9 @@ impl DeltaLayerInner {
                    let delta_key = DeltaKey::from_slice(key);
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
-                        reader: BlockCursor::new(Adapter(this.clone())),
+                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
+                            Adapter(dl),
+                        )),
                    };
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
@@ -1004,43 +1002,23 @@ impl DeltaLayerInner {
    }
 }

-/// Cloneable borrow wrapper to make borrows behave like smart pointers.
-///
-/// Shared references are trivially copyable. This wrapper avoids (confusion) to otherwise attempt
-/// cloning DeltaLayerInner.
-pub(crate) struct Ref<T>(T);
-
-impl<'a, T> AsRef<T> for Ref<&'a T> {
-    fn as_ref(&self) -> &T {
-        self.0
-    }
-}
-
-impl<'a, T> Clone for Ref<&'a T> {
-    fn clone(&self) -> Self {
-        *self
-    }
-}
-
-impl<'a, T> Copy for Ref<&'a T> {}
-
 /// A set of data associated with a delta layer key and its value
-pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
+pub struct DeltaEntry<'a> {
    pub key: Key,
    pub lsn: Lsn,
    /// Size of the stored value
    pub size: u64,
    /// Reference to the on-disk value
-    pub val: ValueRef<T>,
+    pub val: ValueRef<'a>,
 }

 /// Reference to an on-disk value
-pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
+pub struct ValueRef<'a> {
    blob_ref: BlobRef,
-    reader: BlockCursor<Adapter<T>>,
+    reader: BlockCursor<'a>,
 }

-impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
+impl<'a> ValueRef<'a> {
    /// Loads the value from disk
    pub async fn load(&self) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
@@ -1050,10 +1028,10 @@ impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
    }
 }

-struct Adapter<T: AsRef<DeltaLayerInner>>(T);
+pub(crate) struct Adapter<T>(T);

-impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum)
+impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
+    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        self.0.as_ref().file.read_blk(blknum).await
    }
 }
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -212,9 +212,20 @@ pub enum LayerFileName {
 }

 impl LayerFileName {
-    pub fn file_name(&self) -> String {
+    pub(crate) fn file_name(&self) -> String {
        self.to_string()
    }
+
+    /// Reports whether this layer file lies beyond `disk_consistent_lsn`, i.e. "in the future".
+    /// Such layers are discarded during timeline initialization from that LSN.
+    pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
+        match self {
+            // An image layer is judged by its single LSN.
+            Self::Image(image) => image.lsn > disk_consistent_lsn,
+            // A delta layer is judged by the end of its LSN range.
+            Self::Delta(delta) => delta.lsn_range.end > disk_consistent_lsn + 1,
+        }
+    }
 }

 impl fmt::Display for LayerFileName {
@@ -263,8 +274,8 @@ impl serde::Serialize for LayerFileName {
        S: serde::Serializer,
    {
        match self {
-            Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
-            Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
+            Self::Image(fname) => serializer.collect_str(fname),
+            Self::Delta(fname) => serializer.collect_str(fname),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -349,7 +349,8 @@ impl ImageLayer {
            PathOrConf::Path(_) => None,
        };

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
+        let loaded =
+            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -432,7 +433,7 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
-    pub(super) fn load(
+    pub(super) async fn load(
        path: &std::path::Path,
        lsn: Lsn,
        summary: Option<Summary>,
@@ -440,7 +441,7 @@ impl ImageLayerInner {
        let file = VirtualFile::open(path)
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0)?;
+        let summary_blk = file.read_blk(0).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,14 +7,12 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::{Key, Value};
-use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
-use std::cell::RefCell;
 use std::collections::HashMap;
 use std::sync::OnceLock;
 use tracing::*;
@@ -32,12 +30,6 @@ use tokio::sync::RwLock;

 use super::{DeltaLayer, DeltaLayerWriter, Layer};

-thread_local! {
-    /// A buffer for serializing object during [`InMemoryLayer::put_value`].
-    /// This buffer is reused for each serialization to avoid additional malloc calls.
-    static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
-}
-
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_id: TenantId,
@@ -238,7 +230,7 @@ impl InMemoryLayer {
    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
-        Ok(inner.file.size())
+        Ok(inner.file.len())
    }

    ///
@@ -273,17 +265,17 @@ impl InMemoryLayer {
    /// Adds the page version to the in-memory tree
    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let mut inner = self.inner.write().await;
+        let inner: &mut _ = &mut *self.inner.write().await;
        self.assert_writable();

        let off = {
-            SER_BUFFER.with(|x| -> Result<_> {
-                let mut buf = x.borrow_mut();
-                buf.clear();
-                val.ser_into(&mut (*buf))?;
-                let off = inner.file.write_blob(&buf)?;
-                Ok(off)
-            })?
+            // Avoid doing allocations for "small" values.
+            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+            buf.clear();
+            val.ser_into(&mut buf)?;
+            inner.file.write_blob(&buf).await?
        };

        let vec_map = inner.index.entry(key).or_default();
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -185,7 +185,7 @@ impl RemoteLayer {
    /// Create a Layer struct representing this layer, after it has been downloaded.
    pub(crate) fn create_downloaded_layer(
        &self,
-        layer_map_lock_held_witness: &LayerManager,
+        _layer_map_lock_held_witness: &LayerManager,
        conf: &'static PageServerConf,
        file_size: u64,
    ) -> Arc<dyn PersistentLayer> {
@@ -197,10 +197,8 @@ impl RemoteLayer {
                self.desc.tenant_id,
                &fname,
                file_size,
-                self.access_stats.clone_for_residence_change(
-                    layer_map_lock_held_witness,
-                    LayerResidenceStatus::Resident,
-                ),
+                self.access_stats
+                    .clone_for_residence_change(LayerResidenceStatus::Resident),
            ))
        } else {
            let fname = self.desc.image_file_name();
@@ -210,10 +208,8 @@ impl RemoteLayer {
                self.desc.tenant_id,
                &fname,
                file_size,
-                self.access_stats.clone_for_residence_change(
-                    layer_map_lock_held_witness,
-                    LayerResidenceStatus::Resident,
-                ),
+                self.access_stats
+                    .clone_for_residence_change(LayerResidenceStatus::Resident),
            ))
        }
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,5 +1,6 @@
 pub mod delete;
 mod eviction_task;
+mod init;
 pub mod layer_manager;
 mod logical_size;
 pub mod span;
@@ -27,7 +28,6 @@ use utils::id::TenantTimelineId;

 use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
-use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
 use std::pin::pin;
@@ -38,15 +38,13 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
-use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
-    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
-    LayerAccessStats, LayerFileName, RemoteLayer,
+    DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
 };
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
-    ephemeral_file::is_ephemeral_file,
    layer_map::{LayerMap, SearchResult},
    metadata::{save_metadata, TimelineMetadata},
    par_fsync,
@@ -69,6 +67,7 @@ use postgres_connection::PgConnectionConfig;
 use postgres_ffi::to_pg_timestamp;
 use utils::{
    completion,
+    generation::Generation,
    id::{TenantId, TimelineId},
    lsn::{AtomicLsn, Lsn, RecordLsn},
    seqwait::SeqWait,
@@ -78,11 +77,10 @@ use utils::{
 use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
+use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::walredo::WalRedoManager;
-use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
-use crate::{is_temporary, task_mgr};

 use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -155,6 +153,10 @@ pub struct Timeline {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,

+    /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
+    /// Never changes for the lifetime of this [`Timeline`] object.
+    generation: Generation,
+
    pub pg_version: u32,

    /// The tuple has two elements.
@@ -468,7 +470,7 @@ impl Timeline {
        // The cached image can be returned directly if there is no WAL between the cached image
        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
        // for redo.
-        let cached_page_img = match self.lookup_cached_page(&key, lsn) {
+        let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
            Some((cached_lsn, cached_img)) => {
                match cached_lsn.cmp(&lsn) {
                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
@@ -497,6 +499,7 @@ impl Timeline {

        RECONSTRUCT_TIME
            .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
+            .await
    }

    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -1201,7 +1204,7 @@ impl Timeline {
                Ok(delta) => Some(delta),
            };

-        let layer_metadata = LayerFileMetadata::new(layer_file_size);
+        let layer_metadata = LayerFileMetadata::new(layer_file_size, self.generation);

        let new_remote_layer = Arc::new(match local_layer.filename() {
            LayerFileName::Image(image_name) => RemoteLayer::new_img(
@@ -1211,7 +1214,7 @@ impl Timeline {
                &layer_metadata,
                local_layer
                    .access_stats()
-                    .clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
+                    .clone_for_residence_change(LayerResidenceStatus::Evicted),
            ),
            LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
                self.tenant_id,
@@ -1220,7 +1223,7 @@ impl Timeline {
                &layer_metadata,
                local_layer
                    .access_stats()
-                    .clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
+                    .clone_for_residence_change(LayerResidenceStatus::Evicted),
            ),
        });

@@ -1379,6 +1382,7 @@ impl Timeline {
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
        tenant_id: TenantId,
+        generation: Generation,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        resources: TimelineResources,
        pg_version: u32,
@@ -1408,6 +1412,7 @@ impl Timeline {
                myself: myself.clone(),
                timeline_id,
                tenant_id,
+                generation,
                pg_version,
                layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
                wanted_image_layers: Mutex::new(None),
@@ -1518,7 +1523,7 @@ impl Timeline {
        let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
        let self_clone = Arc::clone(self);

-        info!("spawning flush loop");
+        debug!("spawning flush loop");
        *flush_loop_state = FlushLoopState::Running {
            #[cfg(test)]
            expect_initdb_optimization: false,
@@ -1589,9 +1594,7 @@ impl Timeline {
        ));
    }

-    ///
    /// Initialize with an empty layer map. Used when creating a new timeline.
-    ///
    pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
        let mut layers = self.layers.try_write().expect(
            "in the context where we call this function, no other task has access to the object",
@@ -1599,10 +1602,16 @@ impl Timeline {
        layers.initialize_empty(Lsn(start_lsn.0));
    }

-    ///
-    /// Scan the timeline directory to populate the layer map.
-    ///
-    pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
+    /// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only
+    /// files.
+    pub(super) async fn load_layer_map(
+        &self,
+        disk_consistent_lsn: Lsn,
+        index_part: Option<IndexPart>,
+    ) -> anyhow::Result<()> {
+        use init::{Decision::*, Discovered, FutureLayer};
+        use LayerFileName::*;
+
        let mut guard = self.layers.write().await;

        let timer = self.metrics.load_layer_map_histo.start_timer();
@@ -1610,102 +1619,160 @@ impl Timeline {
        // Scan timeline directory and create ImageFileName and DeltaFilename
        // structs representing all files on disk
        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
-        // total size of layer files in the current timeline directory
-        let mut total_physical_size = 0;
+        let (conf, tenant_id, timeline_id) = (self.conf, self.tenant_id, self.timeline_id);
+        let span = tracing::Span::current();

-        let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();
+        // Copy to move into the task we're about to spawn
+        let generation = self.generation;

-        for direntry in fs::read_dir(timeline_path)? {
-            let direntry = direntry?;
-            let direntry_path = direntry.path();
-            let fname = direntry.file_name();
-            let fname = fname.to_string_lossy();
+        let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
+            move || {
+                let _g = span.entered();
+                let discovered = init::scan_timeline_dir(&timeline_path)?;
+                let mut discovered_layers = Vec::with_capacity(discovered.len());
+                let mut unrecognized_files = Vec::new();

-            if let Some(filename) = ImageFileName::parse_str(&fname) {
-                // create an ImageLayer struct for each image file.
-                if filename.lsn > disk_consistent_lsn {
-                    info!(
-                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
-                        filename, self.timeline_id, disk_consistent_lsn
-                    );
+                let mut path = timeline_path;

-                    rename_to_backup(&direntry_path)?;
-                    continue;
+                for discovered in discovered {
+                    let (name, kind) = match discovered {
+                        Discovered::Layer(file_name, file_size) => {
+                            discovered_layers.push((file_name, file_size));
+                            continue;
+                        }
+                        Discovered::Metadata | Discovered::IgnoredBackup => {
+                            continue;
+                        }
+                        Discovered::Unknown(file_name) => {
+                            // we will later error if there are any
+                            unrecognized_files.push(file_name);
+                            continue;
+                        }
+                        Discovered::Ephemeral(name) => (name, "old ephemeral file"),
+                        Discovered::Temporary(name) => (name, "temporary timeline file"),
+                        Discovered::TemporaryDownload(name) => (name, "temporary download"),
+                    };
+                    path.push(name);
+                    init::cleanup(&path, kind)?;
+                    path.pop();
                }

-                let file_size = direntry_path.metadata()?.len();
-                let stats =
-                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
-
-                let layer = ImageLayer::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_id,
-                    &filename,
-                    file_size,
-                    stats,
-                );
-
-                total_physical_size += file_size;
-                loaded_layers.push(Arc::new(layer));
-            } else if let Some(filename) = DeltaFileName::parse_str(&fname) {
-                // Create a DeltaLayer struct for each delta file.
-                // The end-LSN is exclusive, while disk_consistent_lsn is
-                // inclusive. For example, if disk_consistent_lsn is 100, it is
-                // OK for a delta layer to have end LSN 101, but if the end LSN
-                // is 102, then it might not have been fully flushed to disk
-                // before crash.
-                if filename.lsn_range.end > disk_consistent_lsn + 1 {
-                    info!(
-                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
-                        filename, self.timeline_id, disk_consistent_lsn
+                if !unrecognized_files.is_empty() {
+                    // assume that if there are any there are many many.
+                    let n = unrecognized_files.len();
+                    let first = &unrecognized_files[..n.min(10)];
+                    anyhow::bail!(
+                        "unrecognized files in timeline dir (total {n}), first 10: {first:?}"
                    );
-
-                    rename_to_backup(&direntry_path)?;
-                    continue;
                }

-                let file_size = direntry_path.metadata()?.len();
-                let stats =
-                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
-
-                let layer = DeltaLayer::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_id,
-                    &filename,
-                    file_size,
-                    stats,
+                let decided = init::reconcile(
+                    discovered_layers,
+                    index_part.as_ref(),
+                    disk_consistent_lsn,
+                    generation,
                );

-                total_physical_size += file_size;
-                loaded_layers.push(Arc::new(layer));
-            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
-                // ignore these
-            } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
-                info!(
-                    "skipping temp download file, reconcile_with_remote will resume / clean up: {}",
-                    fname
-                );
-            } else if is_ephemeral_file(&fname) {
-                // Delete any old ephemeral files
-                trace!("deleting old ephemeral file in timeline dir: {}", fname);
-                fs::remove_file(&direntry_path)?;
-            } else if is_temporary(&direntry_path) {
-                info!("removing temp timeline file at {}", direntry_path.display());
-                fs::remove_file(&direntry_path).with_context(|| {
-                    format!(
-                        "failed to remove temp download file at {}",
-                        direntry_path.display()
-                    )
-                })?;
-            } else {
-                warn!("unrecognized filename in timeline dir: {}", fname);
+                let mut loaded_layers = Vec::new();
+                let mut needs_upload = Vec::new();
+                let mut needs_cleanup = Vec::new();
+                let mut total_physical_size = 0;
+
+                for (name, decision) in decided {
+                    let decision = match decision {
+                        Ok(UseRemote { local, remote }) => {
+                            path.push(name.file_name());
+                            init::cleanup_local_file_for_remote(&path, &local, &remote)?;
+                            path.pop();
+
+                            UseRemote { local, remote }
+                        }
+                        Ok(decision) => decision,
+                        Err(FutureLayer { local }) => {
+                            if local.is_some() {
+                                path.push(name.file_name());
+                                init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
+                                path.pop();
+                            }
+                            needs_cleanup.push(name);
+                            continue;
+                        }
+                    };
+
+                    match &name {
+                        Delta(d) => assert!(d.lsn_range.end <= disk_consistent_lsn + 1),
+                        Image(i) => assert!(i.lsn <= disk_consistent_lsn),
+                    }
+
+                    let status = match &decision {
+                        UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
+                        Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
+                    };
+
+                    let stats = LayerAccessStats::for_loading_layer(status);
+
+                    let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
+                        (Delta(d), UseLocal(m) | NeedsUpload(m)) => {
+                            total_physical_size += m.file_size();
+                            Arc::new(DeltaLayer::new(
+                                conf,
+                                timeline_id,
+                                tenant_id,
+                                &d,
+                                m.file_size(),
+                                stats,
+                            ))
+                        }
+                        (Image(i), UseLocal(m) | NeedsUpload(m)) => {
+                            total_physical_size += m.file_size();
+                            Arc::new(ImageLayer::new(
+                                conf,
+                                timeline_id,
+                                tenant_id,
+                                &i,
+                                m.file_size(),
+                                stats,
+                            ))
+                        }
+                        (Delta(d), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
+                            RemoteLayer::new_delta(tenant_id, timeline_id, &d, remote, stats),
+                        ),
+                        (Image(i), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
+                            RemoteLayer::new_img(tenant_id, timeline_id, &i, remote, stats),
+                        ),
+                    };
+
+                    if let NeedsUpload(m) = decision {
+                        needs_upload.push((layer.clone(), m));
+                    }
+
+                    loaded_layers.push(layer);
+                }
+                Ok((
+                    loaded_layers,
+                    (needs_upload, needs_cleanup),
+                    total_physical_size,
+                ))
            }
-        }
+        })
+        .await
+        .map_err(anyhow::Error::new)
+        .and_then(|x| x)?;

        let num_layers = loaded_layers.len();
-        guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
+
+        guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
+
+        if let Some(rtc) = self.remote_client.as_ref() {
+            let (needs_upload, needs_cleanup) = to_sync;
+            for (layer, m) in needs_upload {
+                rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
+            }
+            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
+            rtc.schedule_index_upload_for_file_changes()?;
+            // Tenant::create_timeline will wait for these uploads to happen before returning, or
+            // on retry.
+        }

        info!(
            "loaded layer map with {} layers at {}, total physical size: {}",
@@ -1716,236 +1783,6 @@ impl Timeline {
            .set(total_physical_size);

        timer.stop_and_record();
-
-        Ok(())
-    }
-
-    async fn create_remote_layers(
-        &self,
-        index_part: &IndexPart,
-        local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
-        up_to_date_disk_consistent_lsn: Lsn,
-    ) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
-        // Are we missing some files that are present in remote storage?
-        // Create RemoteLayer instances for them.
-        let mut local_only_layers = local_layers;
-
-        // We're holding a layer map lock for a while but this
-        // method is only called during init so it's fine.
-        let mut guard = self.layers.write().await;
-
-        let mut corrupted_local_layers = Vec::new();
-        let mut added_remote_layers = Vec::new();
-        for remote_layer_name in index_part.layer_metadata.keys() {
-            let local_layer = local_only_layers.remove(remote_layer_name);
-
-            let remote_layer_metadata = index_part
-                .layer_metadata
-                .get(remote_layer_name)
-                .map(LayerFileMetadata::from)
-                .with_context(|| {
-                    format!(
-                        "No remote layer metadata found for layer {}",
-                        remote_layer_name.file_name()
-                    )
-                })?;
-
-            // Is the local layer's size different from the size stored in the
-            // remote index file?
-            // If so, rename_to_backup those files & replace their local layer with
-            // a RemoteLayer in the layer map so that we re-download them on-demand.
-            if let Some(local_layer) = local_layer {
-                let local_layer_path = local_layer
-                    .local_path()
-                    .expect("caller must ensure that local_layers only contains local layers");
-                ensure!(
-                    local_layer_path.exists(),
-                    "every layer from local_layers must exist on disk: {}",
-                    local_layer_path.display()
-                );
-
-                let remote_size = remote_layer_metadata.file_size();
-                let metadata = local_layer_path.metadata().with_context(|| {
-                    format!(
-                        "get file size of local layer {}",
-                        local_layer_path.display()
-                    )
-                })?;
-                let local_size = metadata.len();
-                if local_size != remote_size {
-                    warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-                    if let Err(err) = rename_to_backup(&local_layer_path) {
-                        assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
-                        anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
-                    } else {
-                        self.metrics.resident_physical_size_gauge.sub(local_size);
-                        corrupted_local_layers.push(local_layer);
-                        // fall-through to adding the remote layer
-                    }
-                } else {
-                    debug!(
-                        "layer is present locally and file size matches remote, using it: {}",
-                        local_layer_path.display()
-                    );
-                    continue;
-                }
-            }
-
-            info!(
-                "remote layer does not exist locally, creating remote layer: {}",
-                remote_layer_name.file_name()
-            );
-
-            match remote_layer_name {
-                LayerFileName::Image(imgfilename) => {
-                    if imgfilename.lsn > up_to_date_disk_consistent_lsn {
-                        info!(
-                        "found future image layer {} on timeline {} remote_consistent_lsn is {}",
-                        imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
-                    );
-                        continue;
-                    }
-                    let stats =
-                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
-
-                    let remote_layer = RemoteLayer::new_img(
-                        self.tenant_id,
-                        self.timeline_id,
-                        imgfilename,
-                        &remote_layer_metadata,
-                        stats,
-                    );
-                    let remote_layer = Arc::new(remote_layer);
-                    added_remote_layers.push(remote_layer);
-                }
-                LayerFileName::Delta(deltafilename) => {
-                    // Create a RemoteLayer for the delta file.
-                    // The end-LSN is exclusive, while disk_consistent_lsn is
-                    // inclusive. For example, if disk_consistent_lsn is 100, it is
-                    // OK for a delta layer to have end LSN 101, but if the end LSN
-                    // is 102, then it might not have been fully flushed to disk
-                    // before crash.
-                    if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
-                        info!(
-                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
-                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
-                        );
-                        continue;
-                    }
-                    let stats =
-                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
-
-                    let remote_layer = RemoteLayer::new_delta(
-                        self.tenant_id,
-                        self.timeline_id,
-                        deltafilename,
-                        &remote_layer_metadata,
-                        stats,
-                    );
-                    let remote_layer = Arc::new(remote_layer);
-                    added_remote_layers.push(remote_layer);
-                }
-            }
-        }
-        guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
-        Ok(local_only_layers)
-    }
-
-    /// This function will synchronize local state with what we have in remote storage.
-    ///
-    /// Steps taken:
-    /// 1. Initialize upload queue based on `index_part`.
-    /// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
-    ///    The list of layers on the remote comes from `index_part`.
-    ///    The list of local layers is given by the layer map's `iter_historic_layers()`.
-    ///    So, the layer map must have been loaded already.
-    /// 3. Schedule upload of local-only layer files (which will then also update the remote
-    ///    IndexPart to include the new layer files).
-    ///
-    /// Refer to the [`remote_timeline_client`] module comment for more context.
-    ///
-    /// # TODO
-    /// May be a bit cleaner to do things based on populated remote client,
-    /// and then do things based on its upload_queue.latest_files.
-    #[instrument(skip(self, index_part, up_to_date_metadata))]
-    pub async fn reconcile_with_remote(
-        &self,
-        up_to_date_metadata: &TimelineMetadata,
-        index_part: Option<&IndexPart>,
-    ) -> anyhow::Result<()> {
-        info!("starting");
-        let remote_client = self
-            .remote_client
-            .as_ref()
-            .ok_or_else(|| anyhow!("cannot download without remote storage"))?;
-
-        let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
-
-        let local_layers = {
-            let guard = self.layers.read().await;
-            let layers = guard.layer_map();
-            layers
-                .iter_historic_layers()
-                .map(|l| (l.filename(), guard.get_from_desc(&l)))
-                .collect::<HashMap<_, _>>()
-        };
-
-        // If no writes happen, new branches do not have any layers, only the metadata file.
-        let has_local_layers = !local_layers.is_empty();
-        let local_only_layers = match index_part {
-            Some(index_part) => {
-                info!(
-                    "initializing upload queue from remote index with {} layer files",
-                    index_part.layer_metadata.len()
-                );
-                remote_client.init_upload_queue(index_part)?;
-                self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
-                    .await?
-            }
-            None => {
-                info!("initializing upload queue as empty");
-                remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
-                local_layers
-            }
-        };
-
-        if has_local_layers {
-            // Are there local files that don't exist remotely? Schedule uploads for them.
-            // Local timeline metadata will get uploaded to remove along witht he layers.
-            for (layer_name, layer) in &local_only_layers {
-                // XXX solve this in the type system
-                let layer_path = layer
-                    .local_path()
-                    .expect("local_only_layers only contains local layers");
-                let layer_size = layer_path
-                    .metadata()
-                    .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
-                    .len();
-                info!("scheduling {layer_path:?} for upload");
-                remote_client
-                    .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
-            }
-            remote_client.schedule_index_upload_for_file_changes()?;
-        } else if index_part.is_none() {
-            // No data on the remote storage, no local layers, local metadata file.
-            //
-            // TODO https://github.com/neondatabase/neon/issues/3865
-            // Currently, console does not wait for the timeline data upload to the remote storage
-            // and considers the timeline created, expecting other pageserver nodes to work with it.
-            // Branch metadata upload could get interrupted (e.g pageserver got killed),
-            // hence any locally existing branch metadata with no remote counterpart should be uploaded,
-            // otherwise any other pageserver won't see the branch on `attach`.
-            //
-            // After the issue gets implemented, pageserver should rather remove the branch,
-            // since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
-            // no need to keep the old files.
-            remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
-        } else {
-            // Local timeline has a metadata file, remote one too, both have no layers to sync.
-        }
-
-        info!("Done");
-
        Ok(())
    }

@@ -2442,7 +2279,15 @@ impl Timeline {
                        )));
                    }
                }
-                ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;
+                ancestor
+                    .wait_lsn(timeline.ancestor_lsn, ctx)
+                    .await
+                    .with_context(|| {
+                        format!(
+                            "wait for lsn {} on ancestor timeline_id={}",
+                            timeline.ancestor_lsn, ancestor.timeline_id
+                        )
+                    })?;

                timeline_owned = ancestor;
                timeline = &*timeline_owned;
@@ -2621,13 +2466,14 @@ impl Timeline {
        }
    }

-    fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
+    async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
        let cache = page_cache::get();

        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
        // We should look at the key to determine if it's a cacheable object
-        let (lsn, read_guard) =
-            cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
+        let (lsn, read_guard) = cache
+            .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
+            .await?;
        let img = Bytes::from(read_guard.to_vec());
        Some((lsn, img))
    }
@@ -2837,7 +2683,7 @@ impl Timeline {
                (
                    HashMap::from([(
                        layer.filename(),
-                        LayerFileMetadata::new(layer.layer_desc().file_size),
+                        LayerFileMetadata::new(layer.layer_desc().file_size, self.generation),
                    )]),
                    Some(layer),
                )
@@ -2852,7 +2698,6 @@ impl Timeline {
            if let Some(ref l) = delta_layer_to_add {
                // TODO: move access stats, metrics update, etc. into layer manager.
                l.access_stats().record_residence_event(
-                    &guard,
                    LayerResidenceStatus::Resident,
                    LayerResidenceEventReason::LayerCreate,
                );
@@ -3234,14 +3079,16 @@ impl Timeline {
                .metadata()
                .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?;

-            layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
+            layer_paths_to_upload.insert(
+                path,
+                LayerFileMetadata::new(metadata.len(), self.generation),
+            );

            self.metrics
                .resident_physical_size_gauge
                .add(metadata.len());
            let l = Arc::new(l);
            l.access_stats().record_residence_event(
-                &guard,
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
@@ -3910,7 +3757,7 @@ impl Timeline {
            if let Some(remote_client) = &self.remote_client {
                remote_client.schedule_layer_file_upload(
                    &l.filename(),
-                    &LayerFileMetadata::new(metadata.len()),
+                    &LayerFileMetadata::new(metadata.len(), self.generation),
                )?;
            }

@@ -3919,9 +3766,11 @@ impl Timeline {
                .resident_physical_size_gauge
                .add(metadata.len());

-            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
+            new_layer_paths.insert(
+                new_delta_path,
+                LayerFileMetadata::new(metadata.len(), self.generation),
+            );
            l.access_stats().record_residence_event(
-                &guard,
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
@@ -4312,7 +4161,7 @@ impl Timeline {
    ///
    /// Reconstruct a value, using the given base image and WAL records in 'data'.
    ///
-    fn reconstruct_value(
+    async fn reconstruct_value(
        &self,
        key: Key,
        request_lsn: Lsn,
@@ -4381,6 +4230,7 @@ impl Timeline {
                            last_rec_lsn,
                            &img,
                        )
+                        .await
                        .context("Materialized page memoization failed")
                    {
                        return Err(PageReconstructError::from(e));
@@ -4840,7 +4690,8 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
    for i in 0u32.. {
        new_path.set_file_name(format!("{filename}.{i}.old"));
        if !new_path.exists() {
-            std::fs::rename(path, &new_path)?;
+            std::fs::rename(path, &new_path)
+                .with_context(|| format!("rename {path:?} to {new_path:?}"))?;
            return Ok(());
        }
    }
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -0,0 +1,213 @@
+use crate::{
+    is_temporary,
+    tenant::{
+        ephemeral_file::is_ephemeral_file,
+        remote_timeline_client::{
+            self,
+            index::{IndexPart, LayerFileMetadata},
+        },
+        storage_layer::LayerFileName,
+        Generation,
+    },
+    METADATA_FILE_NAME,
+};
+use anyhow::Context;
+use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
+use utils::lsn::Lsn;
+
+/// Classification of one directory entry, as produced by [`scan_timeline_dir`].
+pub(super) enum Discovered {
+    /// A layer file: its parsed name and its size on disk. The only one we care about
+    Layer(LayerFileName, u64),
+    /// Old ephemeral files from previous launches, should be removed
+    Ephemeral(OsString),
+    /// Old temporary timeline files, unsure what these really are, should be removed
+    Temporary(OsString),
+    /// Temporary on-demand download files, should be removed
+    TemporaryDownload(OsString),
+    /// "metadata" file we persist locally and include in `index_part.json`
+    Metadata,
+    /// Any file whose name ends in `.old`, such as a backup of a previously future layer; ignored
+    IgnoredBackup,
+    /// Unrecognized file name; `Timeline::load_layer_map` fails if any of these are found
+    Unknown(OsString),
+}
+
+/// Lists the directory at `path` and classifies every entry as a [`Discovered`].
+///
+/// Nothing is removed or renamed here. Only entries whose name parses as a layer file name are
+/// stat'ed, to record their size.
+pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
+    let mut found = Vec::new();
+
+    for entry in std::fs::read_dir(path)? {
+        let entry = entry?;
+        let entry_path = entry.path();
+        let os_name = entry.file_name();
+        let lossy_name = os_name.to_string_lossy();
+
+        // A parseable layer file name wins over every other classification below.
+        let kind = if let Ok(layer_name) = LayerFileName::from_str(&lossy_name) {
+            Discovered::Layer(layer_name, entry.metadata()?.len())
+        } else if lossy_name == METADATA_FILE_NAME {
+            Discovered::Metadata
+        } else if lossy_name.ends_with(".old") {
+            // ignore these
+            Discovered::IgnoredBackup
+        } else if remote_timeline_client::is_temp_download_file(&entry_path) {
+            Discovered::TemporaryDownload(os_name)
+        } else if is_ephemeral_file(&lossy_name) {
+            Discovered::Ephemeral(os_name)
+        } else if is_temporary(&entry_path) {
+            Discovered::Temporary(os_name)
+        } else {
+            Discovered::Unknown(os_name)
+        };
+
+        found.push(kind);
+    }
+
+    Ok(found)
+}
+
+/// Decision on what to do with a layer file after considering its local and remote metadata.
+///
+/// Produced by [`reconcile`] for every layer that is not in the future.
+#[derive(Clone)]
+pub(super) enum Decision {
+    /// The layer is not present locally. Carries the remote metadata.
+    Evicted(LayerFileMetadata),
+    /// The layer is present locally, but local metadata does not match remote; the local file
+    /// must be moved out of the way (see [`cleanup_local_file_for_remote`], which renames it to
+    /// a backup rather than deleting it) and the layer treated as evicted.
+    UseRemote {
+        local: LayerFileMetadata,
+        remote: LayerFileMetadata,
+    },
+    /// The layer is present locally, and metadata matches. Carries the local metadata.
+    UseLocal(LayerFileMetadata),
+    /// The layer is only known locally, it needs to be uploaded. Carries the local metadata.
+    NeedsUpload(LayerFileMetadata),
+}
+
+/// The related layer is in the future compared to disk_consistent_lsn, it must not be loaded.
+#[derive(Debug)]
+pub(super) struct FutureLayer {
+    /// The local metadata. `None` if the layer is only known through [`IndexPart`].
+    pub(super) local: Option<LayerFileMetadata>,
+}
+
+/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
+///
+/// * `discovered`: layer files found on local disk, with their on-disk size.
+/// * `index_part`: the remote index, when there is one; its `layer_metadata` is the remote view.
+/// * `disk_consistent_lsn`: layers for which `is_in_future` holds against this are rejected with
+///   [`FutureLayer`] no matter where they were found.
+/// * `generation`: our current generation, recorded for local files that are not in `index_part`.
+///
+/// The order of the returned entries is unspecified, as they come out of a `HashMap`.
+///
+/// Apart from [`FutureLayer`], this function should not gain additional reasons to fail; consider
+/// adding such checks earlier, to [`scan_timeline_dir`].
+pub(super) fn reconcile(
+    discovered: Vec<(LayerFileName, u64)>,
+    index_part: Option<&IndexPart>,
+    disk_consistent_lsn: Lsn,
+    generation: Generation,
+) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
+    use Decision::*;
+
+    // name => (local, remote)
+    type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
+
+    let mut discovered = discovered
+        .into_iter()
+        .map(|(name, file_size)| {
+            (
+                name,
+                // The generation here will be corrected to match IndexPart in the merge below, unless
+                // it is not in IndexPart, in which case using our current generation makes sense
+                // because it will be uploaded in this generation.
+                (Some(LayerFileMetadata::new(file_size, generation)), None),
+            )
+        })
+        .collect::<Collected>();
+
+    // merge any index_part information, when available
+    index_part
+        .as_ref()
+        .map(|ip| ip.layer_metadata.iter())
+        .into_iter()
+        .flatten()
+        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
+        .for_each(|(name, metadata)| {
+            if let Some((local, remote)) = discovered.get_mut(name) {
+                // A file listed in IndexPart was uploaded in the generation recorded there, not
+                // necessarily in our current one. Adopt the remote generation for the local
+                // entry, so that the comparison below only reacts to real differences and the
+                // generation assertion there cannot trip merely because we were restarted in a
+                // newer generation.
+                if let Some(local) = local.as_mut() {
+                    local.generation = metadata.generation;
+                }
+                *remote = Some(metadata);
+            } else {
+                discovered.insert(name.to_owned(), (None, Some(metadata)));
+            }
+        });
+
+    discovered
+        .into_iter()
+        .map(|(name, (local, remote))| {
+            let decision = if name.is_in_future(disk_consistent_lsn) {
+                Err(FutureLayer { local })
+            } else {
+                Ok(match (local, remote) {
+                    (Some(local), Some(remote)) if local != remote => {
+                        // holds by construction: generations were aligned in the merge above
+                        assert_eq!(local.generation, remote.generation);
+
+                        UseRemote { local, remote }
+                    }
+                    (Some(x), Some(_)) => UseLocal(x),
+                    (None, Some(x)) => Evicted(x),
+                    (Some(x), None) => NeedsUpload(x),
+                    (None, None) => {
+                        unreachable!("there must not be any non-local non-remote files")
+                    }
+                })
+            };
+
+            (name, decision)
+        })
+        .collect::<Vec<_>>()
+}
+
+/// Removes the leftover file at `path`.
+///
+/// `kind` is a free-form label for what is being removed; it only ends up in the debug log line
+/// and in the error context. Panics if `path` has no file name component.
+pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
+    let file_name = path.file_name().expect("must be file path");
+    tracing::debug!(kind, ?file_name, "cleaning up");
+    std::fs::remove_file(path)
+        .with_context(|| format!("failed to remove {kind} at {}", path.display()))
+}
+
+/// Moves the local file at `path` out of the way because its metadata disagrees with the remote
+/// index; the sizes of `local` and `remote` are only used for the warning.
+///
+/// The file is renamed to a backup, not deleted. If that rename fails, the error is returned
+/// after asserting that the file is still in place.
+pub(super) fn cleanup_local_file_for_remote(
+    path: &Path,
+    local: &LayerFileMetadata,
+    remote: &LayerFileMetadata,
+) -> anyhow::Result<()> {
+    let (local_size, remote_size) = (local.file_size(), remote.file_size());
+    let file_name = path.file_name().expect("must be file path");
+
+    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+
+    crate::tenant::timeline::rename_to_backup(path).map_err(|err| {
+        // a failed rename must not have taken the file away
+        assert!(
+            path.exists(),
+            "we would leave the local_layer without a file if this does not hold: {}",
+            path.display()
+        );
+        err
+    })
+}
+
+/// Logs that the local layer file `name` was found to be in the future compared to
+/// `disk_consistent_lsn`, then renames the file at `path` to a backup.
+pub(super) fn cleanup_future_layer(
+    path: &Path,
+    name: &LayerFileName,
+    disk_consistent_lsn: Lsn,
+) -> anyhow::Result<()> {
+    let kind = match name {
+        LayerFileName::Delta(_) => "delta",
+        LayerFileName::Image(_) => "image",
+    };
+    // future image layers are allowed to be produced always for not yet flushed to disk
+    // lsns stored in InMemoryLayer.
+    tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
+    crate::tenant::timeline::rename_to_backup(path)
+}
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -13,7 +13,7 @@ use crate::{
        layer_map::{BatchedUpdates, LayerMap},
        storage_layer::{
            AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
-            PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
+            PersistentLayerDesc, PersistentLayerKey,
        },
        timeline::compare_arced_layers,
    },
@@ -85,21 +85,6 @@ impl LayerManager {
        self.layer_map.next_open_layer_at = Some(next_open_layer_at);
    }

-    pub(crate) fn initialize_remote_layers(
-        &mut self,
-        corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
-        remote_layers: Vec<Arc<RemoteLayer>>,
-    ) {
-        let mut updates = self.layer_map.batch_update();
-        for layer in corrupted_local_layers {
-            Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
-        }
-        for layer in remote_layers {
-            Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
-        }
-        updates.flush();
-    }
-
    /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
    /// called within `get_layer_for_write`.
    pub(crate) fn get_layer_for_write(
@@ -265,16 +250,6 @@ impl LayerManager {
        mapping.insert(layer);
    }

-    /// Helper function to remove a layer into the layer map and file manager
-    fn remove_historic_layer(
-        layer: Arc<dyn PersistentLayer>,
-        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
-    ) {
-        updates.remove_historic(layer.layer_desc());
-        mapping.remove(layer);
-    }
-
    /// Removes the layer from local FS (if present) and from memory.
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
-use crate::task_mgr::TaskKind;
+use crate::task_mgr::{shutdown_token, TaskKind};
 use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -31,10 +31,11 @@ use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

-use postgres_connection::{parse_host_port, PgConnectionConfig};
+use postgres_connection::PgConnectionConfig;
 use utils::backoff::{
    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
+use utils::postgres_client::wal_stream_connection_config;
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
@@ -211,11 +212,14 @@ async fn subscribe_for_timeline_updates(
    id: TenantTimelineId,
 ) -> Streaming<SafekeeperTimelineInfo> {
    let mut attempt = 0;
+    let cancel = shutdown_token();
+
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
+            &cancel,
        )
        .await;
        attempt += 1;
@@ -876,33 +880,6 @@ impl ReconnectReason {
    }
 }

-fn wal_stream_connection_config(
-    TenantTimelineId {
-        tenant_id,
-        timeline_id,
-    }: TenantTimelineId,
-    listen_pg_addr_str: &str,
-    auth_token: Option<&str>,
-    availability_zone: Option<&str>,
-) -> anyhow::Result<PgConnectionConfig> {
-    let (host, port) =
-        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
-    let port = port.unwrap_or(5432);
-    let mut connstr = PgConnectionConfig::new_host_port(host, port)
-        .extend_options([
-            "-c".to_owned(),
-            format!("timeline_id={}", timeline_id),
-            format!("tenant_id={}", tenant_id),
-        ])
-        .set_password(auth_token.map(|s| s.to_owned()));
-
-    if let Some(availability_zone) = availability_zone {
-        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
-    }
-
-    Ok(connstr)
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -918,6 +895,7 @@ mod tests {
            timeline: SafekeeperTimelineInfo {
                safekeeper_id: 0,
                tenant_timeline_id: None,
+                term: 0,
                last_log_term: 0,
                flush_lsn: 0,
                commit_lsn,
@@ -926,6 +904,7 @@ mod tests {
                peer_horizon_lsn: 0,
                local_start_lsn: 0,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
+                http_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
            },
            latest_update,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,6 +1,7 @@
 use crate::metrics::RemoteOpFileKind;

 use super::storage_layer::LayerFileName;
+use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -148,17 +149,16 @@ impl UploadQueue {
            );
        }

-        let index_part_metadata = index_part.parse_metadata()?;
        info!(
            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part_metadata.disk_consistent_lsn()
+            index_part.metadata.disk_consistent_lsn()
        );

        let state = UploadQueueInitialized {
            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: index_part_metadata.clone(),
-            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
+            latest_metadata: index_part.metadata.clone(),
+            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
@@ -206,6 +206,7 @@ pub(crate) struct Delete {
    pub(crate) file_kind: RemoteOpFileKind,
    pub(crate) layer_file_name: LayerFileName,
    pub(crate) scheduled_from_timeline_delete: bool,
+    pub(crate) generation: Generation,
 }

 #[derive(Debug)]
@@ -229,17 +230,21 @@ impl std::fmt::Display for UploadOp {
            UploadOp::UploadLayer(path, metadata) => {
                write!(
                    f,
-                    "UploadLayer({}, size={:?})",
+                    "UploadLayer({}, size={:?}, gen={:?})",
                    path.file_name(),
-                    metadata.file_size()
+                    metadata.file_size(),
+                    metadata.generation,
                )
            }
-            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
+            UploadOp::UploadMetadata(_, lsn) => {
+                write!(f, "UploadMetadata(lsn: {})", lsn)
+            }
            UploadOp::Delete(delete) => write!(
                f,
-                "Delete(path: {}, scheduled_from_timeline_delete: {})",
+                "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
                delete.layer_file_name.file_name(),
-                delete.scheduled_from_timeline_delete
+                delete.scheduled_from_timeline_delete,
+                delete.generation
            ),
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -12,13 +12,19 @@ pub struct PasswordHackPayload {

 impl PasswordHackPayload {
    pub fn parse(bytes: &[u8]) -> Option<Self> {
-        // The format is `project=<utf-8>;<password-bytes>`.
-        let mut iter = bytes.splitn_str(2, ";");
-        let endpoint = iter.next()?.to_str().ok()?;
-        let endpoint = parse_endpoint_param(endpoint)?.to_owned();
-        let password = iter.next()?.to_owned();
+        // The format is `project=<utf-8>;<password-bytes>` or `project=<utf-8>$<password-bytes>` (`;` is tried first).
+        let separators = [";", "$"];
+        for sep in separators {
+            if let Some((endpoint, password)) = bytes.split_once_str(sep) {
+                let endpoint = endpoint.to_str().ok()?;
+                return Some(Self {
+                    endpoint: parse_endpoint_param(endpoint)?.to_owned(),
+                    password: password.to_owned(),
+                });
+            }
+        }

-        Some(Self { endpoint, password })
+        None
    }
 }

@@ -91,4 +97,23 @@ mod tests {
        assert_eq!(payload.endpoint, "foobar");
        assert_eq!(payload.password, b"pass;word");
    }
+
+    #[test]
+    fn parse_password_hack_payload_dollar() {
+        let bytes = b"";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=$";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
+        assert_eq!(payload.password, b"");
+
+        let bytes = b"endpoint=foobar$pass$word";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "foobar");
+        assert_eq!(payload.password, b"pass$word");
+    }
 }
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -16,12 +16,21 @@ use tracing::{error, info, info_span, warn, Instrument};
 pub struct Api {
    endpoint: http::Endpoint,
    caches: &'static ApiCaches,
+    jwt: String,
 }

 impl Api {
    /// Construct an API object containing the auth parameters.
    pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self {
-        Self { endpoint, caches }
+        let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
+            Ok(v) => v,
+            Err(_) => "".to_string(),
+        };
+        Self {
+            endpoint,
+            caches,
+            jwt,
+        }
    }

    pub fn url(&self) -> &str {
@@ -39,6 +48,7 @@ impl Api {
                .endpoint
                .get("proxy_get_role_secret")
                .header("X-Request-ID", &request_id)
+                .header("Authorization", &self.jwt)
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
@@ -83,6 +93,7 @@ impl Api {
                .endpoint
                .get("proxy_wake_compute")
                .header("X-Request-ID", &request_id)
+                .header("Authorization", &self.jwt)
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -408,9 +408,9 @@ async fn connect_to_compute_once(
    let (tx, mut rx) = tokio::sync::watch::channel(session);

    let conn_id = uuid::Uuid::new_v4();
-    let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
+    let span = info_span!(parent: None, "connection", %conn_id);
    span.in_scope(|| {
-        info!(%session, "new connection");
+        info!(%conn_info, %session, "new connection");
    });

    tokio::spawn(
@@ -420,26 +420,28 @@ async fn connect_to_compute_once(
                info!(%session, "changed session");
            }

-            let message = ready!(connection.poll_message(cx));
+            loop {
+                let message = ready!(connection.poll_message(cx));

-            match message {
-                Some(Ok(AsyncMessage::Notice(notice))) => {
-                    info!(%session, "notice: {}", notice);
-                    Poll::Pending
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session, "connection error: {}", e);
+                        return Poll::Ready(())
+                    }
+                    None => {
+                        info!("connection closed");
+                        return Poll::Ready(())
+                    }
                }
-                Some(Ok(AsyncMessage::Notification(notif))) => {
-                    warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                    Poll::Pending
-                }
-                Some(Ok(_)) => {
-                    warn!(%session, "unknown message");
-                    Poll::Pending
-                }
-                Some(Err(e)) => {
-                    error!(%session, "connection error: {}", e);
-                    Poll::Ready(())
-                }
-                None => Poll::Ready(()),
            }
        })
        .instrument(span)
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -304,7 +304,7 @@ pub async fn task_main(

    let make_svc =
        hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream<AddrStream>| {
-            let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string());
+            let sni_name = stream.get_ref().1.server_name().map(|s| s.to_string());
            let conn_pool = conn_pool.clone();

            async move {
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -141,7 +141,7 @@ impl<S> Stream<S> {
    pub fn sni_hostname(&self) -> Option<&str> {
        match self {
            Stream::Raw { .. } => None,
-            Stream::Tls { tls } => tls.get_ref().1.sni_hostname(),
+            Stream::Tls { tls } => tls.get_ref().1.server_name(),
        }
    }
 }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -341,21 +341,35 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {

    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);

-    // Load all timelines from disk to memory.
-    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
-
    // Keep handles to main tasks to die if any of them disappears.
    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
        FuturesUnordered::new();

+    // Start wal backup launcher before loading timelines as we'll notify it
+    // through the channel about timelines which need offloading; not draining
+    // the channel would cause a deadlock.
+    let current_thread_rt = conf
+        .current_thread_runtime
+        .then(|| Handle::try_current().expect("no runtime in main"));
+    let conf_ = conf.clone();
+    let wal_backup_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
+        .spawn(wal_backup::wal_backup_launcher_task_main(
+            conf_,
+            wal_backup_launcher_rx,
+        ))
+        .map(|res| ("WAL backup launcher".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_backup_handle));
+
+    // Load all timelines from disk to memory.
+    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
+
    let conf_ = conf.clone();
    // Run everything in current thread rt, if asked.
    if conf.current_thread_runtime {
        info!("running in current thread runtime");
    }
-    let current_thread_rt = conf
-        .current_thread_runtime
-        .then(|| Handle::try_current().expect("no runtime in main"));

    let wal_service_handle = current_thread_rt
        .as_ref()
@@ -408,17 +422,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .map(|res| ("WAL remover".to_owned(), res));
    tasks_handles.push(Box::pin(wal_remover_handle));

-    let conf_ = conf.clone();
-    let wal_backup_handle = current_thread_rt
-        .as_ref()
-        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
-        .spawn(wal_backup::wal_backup_launcher_task_main(
-            conf_,
-            wal_backup_launcher_rx,
-        ))
-        .map(|res| ("WAL backup launcher".to_owned(), res));
-    tasks_handles.push(Box::pin(wal_backup_handle));
-
    set_build_info_metric(GIT_VERSION);

    // TODO: update tokio-stream, convert to real async Stream with
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -1,7 +1,6 @@
 //! Code to deal with safekeeper control file upgrades
 use crate::safekeeper::{
-    AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory,
-    TermSwitchEntry,
+    AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermLsn,
 };
 use anyhow::{bail, Result};
 use pq_proto::SystemId;
@@ -145,7 +144,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
        let oldstate = SafeKeeperStateV1::des(&buf[..buf.len()])?;
        let ac = AcceptorState {
            term: oldstate.acceptor_state.term,
-            term_history: TermHistory(vec![TermSwitchEntry {
+            term_history: TermHistory(vec![TermLsn {
                term: oldstate.acceptor_state.epoch,
                lsn: Lsn(0),
            }]),
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -19,6 +19,7 @@ use crate::receive_wal::WalReceiverState;
 use crate::safekeeper::ServerInfo;
 use crate::safekeeper::Term;
 use crate::send_wal::WalSenderState;
+use crate::timeline::PeerInfo;
 use crate::{debug_dump, pull_timeline};

 use crate::timelines_global_map::TimelineDeleteForceResult;
@@ -101,6 +102,7 @@ pub struct TimelineStatus {
    pub peer_horizon_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
+    pub peers: Vec<PeerInfo>,
    pub walsenders: Vec<WalSenderState>,
    pub walreceivers: Vec<WalReceiverState>,
 }
@@ -140,6 +142,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
        term_history,
    };

+    let conf = get_conf(&request);
    // Note: we report in memory values which can be lost.
    let status = TimelineStatus {
        tenant_id: ttid.tenant_id,
@@ -153,6 +156,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
        backup_lsn: inmem.backup_lsn,
        peer_horizon_lsn: inmem.peer_horizon_lsn,
        remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
+        peers: tli.get_peers(conf).await,
        walsenders: tli.get_walsenders().get_all(),
        walreceivers: tli.get_walreceivers().get_all(),
    };
@@ -282,12 +286,14 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
            tenant_id: ttid.tenant_id.as_ref().to_owned(),
            timeline_id: ttid.timeline_id.as_ref().to_owned(),
        }),
+        term: sk_info.term.unwrap_or(0),
        last_log_term: sk_info.last_log_term.unwrap_or(0),
        flush_lsn: sk_info.flush_lsn.0,
        commit_lsn: sk_info.commit_lsn.0,
        remote_consistent_lsn: sk_info.remote_consistent_lsn.0,
        peer_horizon_lsn: sk_info.peer_horizon_lsn.0,
        safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
+        http_connstr: sk_info.http_connstr.unwrap_or_else(|| "".to_owned()),
        backup_lsn: sk_info.backup_lsn.0,
        local_start_lsn: sk_info.local_start_lsn.0,
        availability_zone: None,
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -21,7 +21,7 @@ use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
 use crate::safekeeper::{
    AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected,
 };
-use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry};
+use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermLsn};
 use crate::timeline::Timeline;
 use crate::GlobalTimelines;
 use postgres_backend::PostgresBackend;
@@ -119,7 +119,7 @@ async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> any
    let history = tli.get_state().await.1.acceptor_state.term_history;
    let history = history.up_to(lsn.checked_sub(1u64).unwrap());
    let mut history_entries = history.0;
-    history_entries.push(TermSwitchEntry { term, lsn });
+    history_entries.push(TermLsn { term, lsn });
    let history = TermHistory(history_entries);

    let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -19,6 +19,7 @@ pub mod json_ctrl;
 pub mod metrics;
 pub mod pull_timeline;
 pub mod receive_wal;
+pub mod recovery;
 pub mod remove_wal;
 pub mod safekeeper;
 pub mod send_wal;
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -227,7 +227,9 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
    tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
    tokio::fs::rename(tli_dir_path, &timeline_path).await?;

-    let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?;
+    let tli = GlobalTimelines::load_timeline(ttid)
+        .await
+        .context("Failed to load timeline after copy")?;

    info!(
        "Loaded timeline {}, flush_lsn={}",
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -0,0 +1,40 @@
+//! This module implements pulling WAL from peer safekeepers if compute can't
+//! provide it, i.e. safekeeper lags too much.
+
+use std::sync::Arc;
+
+use tokio::{select, time::sleep, time::Duration};
+use tracing::{info, instrument};
+
+use crate::{timeline::Timeline, SafeKeeperConf};
+
+/// Entrypoint for per timeline task which always runs, checking whether
+/// recovery for this safekeeper is needed and starting it if so.
+#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
+pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
+    info!("started");
+    let mut cancellation_rx = match tli.get_cancellation_rx() {
+        Ok(rx) => rx,
+        Err(_) => {
+            info!("timeline canceled during task start");
+            return;
+        }
+    };
+
+    select! {
+        _ = recovery_main_loop(tli) => { unreachable!() }
+        _ = cancellation_rx.changed() => {
+            info!("stopped");
+        }
+    }
+}
+
+const CHECK_INTERVAL_MS: u64 = 2000;
+
+/// Check regularly whether we need to start recovery.
+async fn recovery_main_loop(_tli: Arc<Timeline>) {
+    let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
+    loop {
+        sleep(check_duration).await;
+    }
+}
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -34,22 +34,33 @@ pub const UNKNOWN_SERVER_VERSION: u32 = 0;

 /// Consensus logical timestamp.
 pub type Term = u64;
-const INVALID_TERM: Term = 0;
+pub const INVALID_TERM: Term = 0;

-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
-pub struct TermSwitchEntry {
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct TermLsn {
    pub term: Term,
    pub lsn: Lsn,
 }
+
+// Conversion from a tuple saves typing (e.g. in unit tests).
+impl From<(Term, Lsn)> for TermLsn {
+    fn from(pair: (Term, Lsn)) -> TermLsn {
+        TermLsn {
+            term: pair.0,
+            lsn: pair.1,
+        }
+    }
+}
+
 #[derive(Clone, Serialize, Deserialize)]
-pub struct TermHistory(pub Vec<TermSwitchEntry>);
+pub struct TermHistory(pub Vec<TermLsn>);

 impl TermHistory {
    pub fn empty() -> TermHistory {
        TermHistory(Vec::new())
    }

-    // Parse TermHistory as n_entries followed by TermSwitchEntry pairs
+    // Parse TermHistory as n_entries followed by TermLsn pairs
    pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
        if bytes.remaining() < 4 {
            bail!("TermHistory misses len");
@@ -60,7 +71,7 @@ impl TermHistory {
            if bytes.remaining() < 16 {
                bail!("TermHistory is incomplete");
            }
-            res.push(TermSwitchEntry {
+            res.push(TermLsn {
                term: bytes.get_u64_le(),
                lsn: bytes.get_u64_le().into(),
            })
@@ -557,12 +568,17 @@ where
            .up_to(self.flush_lsn())
    }

+    /// Get current term.
+    pub fn get_term(&self) -> Term {
+        self.state.acceptor_state.term
+    }
+
    pub fn get_epoch(&self) -> Term {
        self.state.acceptor_state.get_epoch(self.flush_lsn())
    }

    /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
-    fn flush_lsn(&self) -> Lsn {
+    pub fn flush_lsn(&self) -> Lsn {
        max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn)
    }

@@ -1138,7 +1154,7 @@ mod tests {
        let pem = ProposerElected {
            term: 1,
            start_streaming_at: Lsn(1),
-            term_history: TermHistory(vec![TermSwitchEntry {
+            term_history: TermHistory(vec![TermLsn {
                term: 1,
                lsn: Lsn(3),
            }]),
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,12 +2,12 @@
 //! with the "START_REPLICATION" message, and registry of walsenders.

 use crate::handler::SafekeeperPostgresHandler;
-use crate::safekeeper::Term;
+use crate::safekeeper::{Term, TermLsn};
 use crate::timeline::Timeline;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
-use anyhow::Context as AnyhowContext;
+use anyhow::{bail, Context as AnyhowContext};
 use bytes::Bytes;
 use parking_lot::Mutex;
 use postgres_backend::PostgresBackend;
@@ -390,26 +390,25 @@ impl SafekeeperPostgresHandler {
            self.appname.clone(),
        ));

-        let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx();
-
-        // Walproposer gets special handling: safekeeper must give proposer all
-        // local WAL till the end, whether committed or not (walproposer will
-        // hang otherwise). That's because walproposer runs the consensus and
-        // synchronizes safekeepers on the most advanced one.
+        // Walsender can operate in one of two modes, selected by
+        // application_name: give only committed WAL (used by pageserver) or all
+        // existing WAL (up to flush_lsn, used by walproposer or peer recovery).
+        // The second case is always driven by a consensus leader, whose term
+        // should generally be supplied as well. However, walproposer recovery
+        // (which will be removed soon) is sloppy about this. TODO: make the
+        // term non-optional once that recovery is gone.
        //
-        // There is a small risk of this WAL getting concurrently garbaged if
-        // another compute rises which collects majority and starts fixing log
-        // on this safekeeper itself. That's ok as (old) proposer will never be
-        // able to commit such WAL.
-        let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
-            let wal_end = tli.get_flush_lsn().await;
-            Some(wal_end)
+        // Fetching WAL without a term in recovery creates a small risk of this
+        // WAL being concurrently overwritten if another compute comes up,
+        // collects a majority and starts fixing the log on this safekeeper.
+        // That's ok as the (old) proposer will never be able to commit such WAL.
+        let end_watch = if self.is_walproposer_recovery() {
+            EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
        } else {
-            None
+            EndWatch::Commit(tli.get_commit_lsn_watch_rx())
        };
-
-        // take the latest commit_lsn if don't have stop_pos
-        let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
+        // we don't check term here; it will be checked on first waiting/WAL reading anyway.
+        let end_pos = end_watch.get();

        if end_pos < start_pos {
            warn!(
@@ -419,8 +418,10 @@ impl SafekeeperPostgresHandler {
        }

        info!(
-            "starting streaming from {:?} till {:?}, available WAL ends at {}",
-            start_pos, stop_pos, end_pos
+            "starting streaming from {:?}, available WAL ends at {}, recovery={}",
+            start_pos,
+            end_pos,
+            matches!(end_watch, EndWatch::Flush(_))
        );

        // switch to copy
@@ -445,9 +446,8 @@ impl SafekeeperPostgresHandler {
            appname,
            start_pos,
            end_pos,
-            stop_pos,
            term,
-            commit_lsn_watch_rx,
+            end_watch,
            ws_guard: ws_guard.clone(),
            wal_reader,
            send_buf: [0; MAX_SEND_SIZE],
@@ -466,6 +466,32 @@ impl SafekeeperPostgresHandler {
    }
 }

+/// Walsender streams either up to commit_lsn (normally) or flush_lsn in the
+/// given term (recovery by walproposer or peer safekeeper).
+enum EndWatch {
+    Commit(Receiver<Lsn>),
+    Flush(Receiver<TermLsn>),
+}
+
+impl EndWatch {
+    /// Get current end of WAL.
+    fn get(&self) -> Lsn {
+        match self {
+            EndWatch::Commit(r) => *r.borrow(),
+            EndWatch::Flush(r) => r.borrow().lsn,
+        }
+    }
+
+    /// Wait for the update.
+    async fn changed(&mut self) -> anyhow::Result<()> {
+        match self {
+            EndWatch::Commit(r) => r.changed().await?,
+            EndWatch::Flush(r) => r.changed().await?,
+        }
+        Ok(())
+    }
+}
+
 /// A half driving sending WAL.
 struct WalSender<'a, IO> {
    pgb: &'a mut PostgresBackend<IO>,
@@ -480,14 +506,12 @@ struct WalSender<'a, IO> {
    // We send this LSN to the receiver as wal_end, so that it knows how much
    // WAL this safekeeper has. This LSN should be as fresh as possible.
    end_pos: Lsn,
-    // If present, terminate after reaching this position; used by walproposer
-    // in recovery.
-    stop_pos: Option<Lsn>,
    /// When streaming uncommitted part, the term the client acts as the leader
    /// in. Streaming is stopped if local term changes to a different (higher)
    /// value.
    term: Option<Term>,
-    commit_lsn_watch_rx: Receiver<Lsn>,
+    /// Watch channel receiver to learn end of available WAL (and wait for its advancement).
+    end_watch: EndWatch,
    ws_guard: Arc<WalSenderGuard>,
    wal_reader: WalReader,
    // buffer for readling WAL into to send it
@@ -497,29 +521,20 @@ struct WalSender<'a, IO> {
 impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
    /// Send WAL until
    /// - an error occurs
-    /// - if we are streaming to walproposer, we've streamed until stop_pos
-    ///   (recovery finished)
-    /// - receiver is caughtup and there is no computes
+    /// - receiver is caught up and there are no computes (if streaming up to commit_lsn)
    ///
    /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
    /// convenience.
    async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
        loop {
-            // If we are streaming to walproposer, check it is time to stop.
-            if let Some(stop_pos) = self.stop_pos {
-                if self.start_pos >= stop_pos {
-                    // recovery finished
-                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
-                        "ending streaming to walproposer at {}, recovery finished",
-                        self.start_pos
-                    )));
-                }
-            } else {
-                // Wait for the next portion if it is not there yet, or just
-                // update our end of WAL available for sending value, we
-                // communicate it to the receiver.
-                self.wait_wal().await?;
-            }
+            // Wait for the next portion if it is not there yet, or just
+            // update our end of WAL available for sending value, we
+            // communicate it to the receiver.
+            self.wait_wal().await?;
+            assert!(
+                self.end_pos > self.start_pos,
+                "nothing to send after waiting for WAL"
+            );

            // try to send as much as available, capped by MAX_SEND_SIZE
            let mut send_size = self
@@ -567,7 +582,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
    /// exit in the meanwhile
    async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
        loop {
-            self.end_pos = *self.commit_lsn_watch_rx.borrow();
+            self.end_pos = self.end_watch.get();
            if self.end_pos > self.start_pos {
                // We have something to send.
                trace!("got end_pos {:?}, streaming", self.end_pos);
@@ -575,27 +590,31 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
            }

            // Wait for WAL to appear, now self.end_pos == self.start_pos.
-            if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
+            if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? {
                self.end_pos = lsn;
                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
            }

-            // Timed out waiting for WAL, check for termination and send KA
-            if let Some(remote_consistent_lsn) = self
-                .ws_guard
-                .walsenders
-                .get_ws_remote_consistent_lsn(self.ws_guard.id)
-            {
-                if self.tli.should_walsender_stop(remote_consistent_lsn).await {
-                    // Terminate if there is nothing more to send.
-                    // Note that "ending streaming" part of the string is used by
-                    // pageserver to identify WalReceiverError::SuccessfulCompletion,
-                    // do not change this string without updating pageserver.
-                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
+            // Timed out waiting for WAL, check for termination and send KA.
+            // Check for termination only if we are streaming up to commit_lsn
+            // (to pageserver).
+            if let EndWatch::Commit(_) = self.end_watch {
+                if let Some(remote_consistent_lsn) = self
+                    .ws_guard
+                    .walsenders
+                    .get_ws_remote_consistent_lsn(self.ws_guard.id)
+                {
+                    if self.tli.should_walsender_stop(remote_consistent_lsn).await {
+                        // Terminate if there is nothing more to send.
+                        // Note that "ending streaming" part of the string is used by
+                        // pageserver to identify WalReceiverError::SuccessfulCompletion,
+                        // do not change this string without updating pageserver.
+                        return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
                        "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
                        self.appname, self.start_pos,
                    )));
+                    }
                }
            }

@@ -663,22 +682,32 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {

 const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);

-/// Wait until we have commit_lsn > lsn or timeout expires. Returns
-/// - Ok(Some(commit_lsn)) if needed lsn is successfully observed;
+/// Wait until we have available WAL > start_pos or timeout expires. Returns
+/// - Ok(Some(end_pos)) if needed lsn is successfully observed;
 /// - Ok(None) if timeout expired;
-/// - Err in case of error (if watch channel is in trouble, shouldn't happen).
-async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
+/// - Err in case of error -- only if 1) term changed while fetching in recovery
+///   mode 2) watch channel closed, which must never happen.
+async fn wait_for_lsn(
+    rx: &mut EndWatch,
+    client_term: Option<Term>,
+    start_pos: Lsn,
+) -> anyhow::Result<Option<Lsn>> {
    let res = timeout(POLL_STATE_TIMEOUT, async move {
-        let mut commit_lsn;
        loop {
-            rx.changed().await?;
-            commit_lsn = *rx.borrow();
-            if commit_lsn > lsn {
-                break;
+            let end_pos = rx.get();
+            if end_pos > start_pos {
+                return Ok(end_pos);
            }
+            if let EndWatch::Flush(rx) = rx {
+                let curr_term = rx.borrow().term;
+                if let Some(client_term) = client_term {
+                    if curr_term != client_term {
+                        bail!("term changed: requested {}, now {}", client_term, curr_term);
+                    }
+                }
+            }
+            rx.changed().await?;
        }
-
-        Ok(commit_lsn)
    })
    .await;

--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -3,8 +3,11 @@

 use anyhow::{anyhow, bail, Result};
 use postgres_ffi::XLogSegNo;
+use serde::{Deserialize, Serialize};
+use serde_with::serde_as;
 use tokio::fs;

+use serde_with::DisplayFromStr;
 use std::cmp::max;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -24,9 +27,10 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;

 use crate::receive_wal::WalReceivers;
+use crate::recovery::recovery_main;
 use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
-    SafekeeperMemState, ServerInfo, Term,
+    SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
 };
 use crate::send_wal::WalSenders;
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
@@ -37,18 +41,25 @@ use crate::SafeKeeperConf;
 use crate::{debug_dump, wal_storage};

 /// Things safekeeper should know about timeline state on peers.
-#[derive(Debug, Clone)]
+#[serde_as]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PeerInfo {
    pub sk_id: NodeId,
    /// Term of the last entry.
    _last_log_term: Term,
    /// LSN of the last record.
+    #[serde_as(as = "DisplayFromStr")]
    _flush_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
    /// sk since backup_lsn.
+    #[serde_as(as = "DisplayFromStr")]
    pub local_start_lsn: Lsn,
-    /// When info was received.
+    /// When info was received. Serde annotations are not very useful but make
+    /// the code compile -- we don't rely on this field externally.
+    #[serde(skip)]
+    #[serde(default = "Instant::now")]
    ts: Instant,
 }

@@ -237,8 +248,9 @@ impl SharedState {
                tenant_id: ttid.tenant_id.as_ref().to_owned(),
                timeline_id: ttid.timeline_id.as_ref().to_owned(),
            }),
+            term: self.sk.state.acceptor_state.term,
            last_log_term: self.sk.get_epoch(),
-            flush_lsn: self.sk.wal_store.flush_lsn().0,
+            flush_lsn: self.sk.flush_lsn().0,
            // note: this value is not flushed to control file yet and can be lost
            commit_lsn: self.sk.inmem.commit_lsn.0,
            remote_consistent_lsn: remote_consistent_lsn.0,
@@ -247,6 +259,7 @@ impl SharedState {
                .advertise_pg_addr
                .to_owned()
                .unwrap_or(conf.listen_pg_addr.clone()),
+            http_connstr: conf.listen_http_addr.to_owned(),
            backup_lsn: self.sk.inmem.backup_lsn.0,
            local_start_lsn: self.sk.state.local_start_lsn.0,
            availability_zone: conf.availability_zone.clone(),
@@ -296,6 +309,13 @@ pub struct Timeline {
    commit_lsn_watch_tx: watch::Sender<Lsn>,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,

+    /// Broadcasts (current term, flush_lsn) updates, walsender is interested in
+    /// them when sending in recovery mode (to walproposer or peers). Note: this
+    /// is just a notification, WAL reading should always be done with lock held as
+    /// term can change otherwise.
+    term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
+    term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,
+
    /// Safekeeper and other state, that should remain consistent and
    /// synchronized with the disk. This is tokio mutex as we write WAL to disk
    /// while holding it, ensuring that consensus checks are in order.
@@ -317,16 +337,20 @@ pub struct Timeline {
 impl Timeline {
    /// Load existing timeline from disk.
    pub fn load_timeline(
-        conf: SafeKeeperConf,
+        conf: &SafeKeeperConf,
        ttid: TenantTimelineId,
        wal_backup_launcher_tx: Sender<TenantTimelineId>,
    ) -> Result<Timeline> {
        let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();

-        let shared_state = SharedState::restore(&conf, &ttid)?;
+        let shared_state = SharedState::restore(conf, &ttid)?;
        let rcl = shared_state.sk.state.remote_consistent_lsn;
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
            watch::channel(shared_state.sk.state.commit_lsn);
+        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((
+            shared_state.sk.get_term(),
+            shared_state.sk.flush_lsn(),
+        )));
        let (cancellation_tx, cancellation_rx) = watch::channel(false);

        Ok(Timeline {
@@ -334,6 +358,8 @@ impl Timeline {
            wal_backup_launcher_tx,
            commit_lsn_watch_tx,
            commit_lsn_watch_rx,
+            term_flush_lsn_watch_tx,
+            term_flush_lsn_watch_rx,
            mutex: Mutex::new(shared_state),
            walsenders: WalSenders::new(rcl),
            walreceivers: WalReceivers::new(),
@@ -345,7 +371,7 @@ impl Timeline {

    /// Create a new timeline, which is not yet persisted to disk.
    pub fn create_empty(
-        conf: SafeKeeperConf,
+        conf: &SafeKeeperConf,
        ttid: TenantTimelineId,
        wal_backup_launcher_tx: Sender<TenantTimelineId>,
        server_info: ServerInfo,
@@ -353,6 +379,8 @@ impl Timeline {
        local_start_lsn: Lsn,
    ) -> Result<Timeline> {
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
+        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
+            watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
        let (cancellation_tx, cancellation_rx) = watch::channel(false);
        let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);

@@ -361,7 +389,9 @@ impl Timeline {
            wal_backup_launcher_tx,
            commit_lsn_watch_tx,
            commit_lsn_watch_rx,
-            mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
+            term_flush_lsn_watch_tx,
+            term_flush_lsn_watch_rx,
+            mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
            walsenders: WalSenders::new(Lsn(0)),
            walreceivers: WalReceivers::new(),
            cancellation_rx,
@@ -370,12 +400,16 @@ impl Timeline {
        })
    }

-    /// Initialize fresh timeline on disk and start background tasks. If bootstrap
+    /// Initialize fresh timeline on disk and start background tasks. If init
    /// fails, timeline is cancelled and cannot be used anymore.
    ///
-    /// Bootstrap is transactional, so if it fails, created files will be deleted,
+    /// Init is transactional, so if it fails, created files will be deleted,
    /// and state on disk should remain unchanged.
-    pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
+    pub async fn init_new(
+        self: &Arc<Timeline>,
+        shared_state: &mut MutexGuard<'_, SharedState>,
+        conf: &SafeKeeperConf,
+    ) -> Result<()> {
        match fs::metadata(&self.timeline_dir).await {
            Ok(_) => {
                // Timeline directory exists on disk, we should leave state unchanged
@@ -391,7 +425,7 @@ impl Timeline {
        // Create timeline directory.
        fs::create_dir_all(&self.timeline_dir).await?;

-        // Write timeline to disk and TODO: start background tasks.
+        // Write timeline to disk and start background tasks.
        if let Err(e) = shared_state.sk.persist().await {
            // Bootstrap failed, cancel timeline and remove timeline directory.
            self.cancel(shared_state);
@@ -405,12 +439,16 @@ impl Timeline {

            return Err(e);
        }
-
-        // TODO: add more initialization steps here
-        self.update_status(shared_state);
+        self.bootstrap(conf);
        Ok(())
    }

+    /// Bootstrap new or existing timeline, starting background tasks.
+    pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
+        // Start recovery task which always runs on the timeline.
+        tokio::spawn(recovery_main(self.clone(), conf.clone()));
+    }
+
    /// Delete timeline from disk completely, by removing timeline directory. Background
    /// timeline activities will stop eventually.
    pub async fn delete_from_disk(
@@ -444,6 +482,16 @@ impl Timeline {
        *self.cancellation_rx.borrow()
    }

+    /// Returns watch channel receiver which is notified when timeline is cancelled.
+    /// Errors if timeline is already cancelled, so the receiver starts not cancelled.
+    pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
+        let rx = self.cancellation_rx.clone();
+        if *rx.borrow() {
+            bail!(TimelineError::Cancelled(self.ttid));
+        }
+        Ok(rx)
+    }
+
    /// Take a writing mutual exclusive lock on timeline shared_state.
    pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
        self.mutex.lock().await
@@ -520,6 +568,11 @@ impl Timeline {
        self.commit_lsn_watch_rx.clone()
    }

+    /// Returns term_flush_lsn watch channel.
+    pub fn get_term_flush_lsn_watch_rx(&self) -> watch::Receiver<TermLsn> {
+        self.term_flush_lsn_watch_rx.clone()
+    }
+
    /// Pass arrived message to the safekeeper.
    pub async fn process_msg(
        &self,
@@ -531,6 +584,7 @@ impl Timeline {

        let mut rmsg: Option<AcceptorProposerMessage>;
        let commit_lsn: Lsn;
+        let term_flush_lsn: TermLsn;
        {
            let mut shared_state = self.write_shared_state().await;
            rmsg = shared_state.sk.process_msg(msg).await?;
@@ -544,8 +598,11 @@ impl Timeline {
            }

            commit_lsn = shared_state.sk.inmem.commit_lsn;
+            term_flush_lsn =
+                TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
        }
        self.commit_lsn_watch_tx.send(commit_lsn)?;
+        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
        Ok(rmsg)
    }

--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -11,7 +11,7 @@ use serde::Serialize;
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::str::FromStr;
-use std::sync::{Arc, Mutex, MutexGuard};
+use std::sync::{Arc, Mutex};
 use tokio::sync::mpsc::Sender;
 use tracing::*;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -71,19 +71,23 @@ pub struct GlobalTimelines;

 impl GlobalTimelines {
    /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
-    pub fn init(
+    pub async fn init(
        conf: SafeKeeperConf,
        wal_backup_launcher_tx: Sender<TenantTimelineId>,
    ) -> Result<()> {
-        let mut state = TIMELINES_STATE.lock().unwrap();
-        assert!(state.wal_backup_launcher_tx.is_none());
-        state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
-        state.conf = Some(conf);
+        // clippy isn't smart enough to understand that drop(state) releases the
+        // lock, so use explicit block
+        let tenants_dir = {
+            let mut state = TIMELINES_STATE.lock().unwrap();
+            assert!(state.wal_backup_launcher_tx.is_none());
+            state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
+            state.conf = Some(conf);

-        // Iterate through all directories and load tenants for all directories
-        // named as a valid tenant_id.
+            // Iterate through all directories and load tenants for all directories
+            // named as a valid tenant_id.
+            state.get_conf().workdir.clone()
+        };
        let mut tenant_count = 0;
-        let tenants_dir = state.get_conf().workdir.clone();
        for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
            .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))?
        {
@@ -93,7 +97,7 @@ impl GlobalTimelines {
                        TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
                    {
                        tenant_count += 1;
-                        GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?;
+                        GlobalTimelines::load_tenant_timelines(tenant_id).await?;
                    }
                }
                Err(e) => error!(
@@ -108,7 +112,7 @@ impl GlobalTimelines {
        info!(
            "found {} tenants directories, successfully loaded {} timelines",
            tenant_count,
-            state.timelines.len()
+            TIMELINES_STATE.lock().unwrap().timelines.len()
        );
        Ok(())
    }
@@ -116,17 +120,21 @@ impl GlobalTimelines {
    /// Loads all timelines for the given tenant to memory. Returns fs::read_dir
    /// errors if any.
    ///
-    /// Note: This function (and all reading/loading below) is sync because
-    /// timelines are loaded while holding GlobalTimelinesState lock. Which is
-    /// fine as this is called only from single threaded main runtime on boot,
-    /// but clippy complains anyway, and suppressing that isn't trivial as async
-    /// is the keyword, ha. That only other user is pull_timeline.rs for which
-    /// being blocked is not that bad, and we can do spawn_blocking.
-    fn load_tenant_timelines(
-        state: &mut MutexGuard<'_, GlobalTimelinesState>,
-        tenant_id: TenantId,
-    ) -> Result<()> {
-        let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
+    /// It is async for the sake of update_status_notify. Since TIMELINES_STATE lock is
+    /// sync and there is no important reason to make it async (it is always
+    /// held for a short while) we just lock and unlock it for each timeline --
+    /// this function is called during init when nothing else is running, so
+    /// this is fine.
+    async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
+        let (conf, wal_backup_launcher_tx) = {
+            let state = TIMELINES_STATE.lock().unwrap();
+            (
+                state.get_conf().clone(),
+                state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
+            )
+        };
+
+        let timelines_dir = conf.tenant_dir(&tenant_id);
        for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
            .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))?
        {
@@ -136,13 +144,16 @@ impl GlobalTimelines {
                        TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                    {
                        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(
-                            state.get_conf().clone(),
-                            ttid,
-                            state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
-                        ) {
+                        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
                            Ok(timeline) => {
-                                state.timelines.insert(ttid, Arc::new(timeline));
+                                let tli = Arc::new(timeline);
+                                TIMELINES_STATE
+                                    .lock()
+                                    .unwrap()
+                                    .timelines
+                                    .insert(ttid, tli.clone());
+                                tli.bootstrap(&conf);
+                                tli.update_status_notify().await.unwrap();
                            }
                            // If we can't load a timeline, it's most likely because of a corrupted
                            // directory. We will log an error and won't allow to delete/recreate
@@ -168,18 +179,22 @@ impl GlobalTimelines {
    }

    /// Load timeline from disk to the memory.
-    pub fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+    pub async fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
        let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();

-        match Timeline::load_timeline(conf, ttid, wal_backup_launcher_tx) {
+        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
            Ok(timeline) => {
                let tli = Arc::new(timeline);
+
                // TODO: prevent concurrent timeline creation/loading
                TIMELINES_STATE
                    .lock()
                    .unwrap()
                    .timelines
                    .insert(ttid, tli.clone());
+
+                tli.bootstrap(&conf);
+
                Ok(tli)
            }
            // If we can't load a timeline, it's bad. Caller will figure it out.
@@ -217,7 +232,7 @@ impl GlobalTimelines {
        info!("creating new timeline {}", ttid);

        let timeline = Arc::new(Timeline::create_empty(
-            conf,
+            &conf,
            ttid,
            wal_backup_launcher_tx,
            server_info,
@@ -240,23 +255,24 @@ impl GlobalTimelines {
            // Write the new timeline to the disk and start background workers.
            // Bootstrap is transactional, so if it fails, the timeline will be deleted,
            // and the state on disk should remain unchanged.
-            if let Err(e) = timeline.bootstrap(&mut shared_state).await {
-                // Note: the most likely reason for bootstrap failure is that the timeline
+            if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
+                // Note: the most likely reason for init failure is that the timeline
                // directory already exists on disk. This happens when timeline is corrupted
                // and wasn't loaded from disk on startup because of that. We want to preserve
                // the timeline directory in this case, for further inspection.

                // TODO: this is an unusual error, perhaps we should send it to sentry
                // TODO: compute will try to create timeline every second, we should add backoff
-                error!("failed to bootstrap timeline {}: {}", ttid, e);
+                error!("failed to init new timeline {}: {}", ttid, e);

-                // Timeline failed to bootstrap, it cannot be used. Remove it from the map.
+                // Timeline failed to init, it cannot be used. Remove it from the map.
                TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid);
                return Err(e);
            }
            // We are done with bootstrap, release the lock, return the timeline.
            // {} block forces release before .await
        }
+        timeline.update_status_notify().await?;
        timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
        Ok(timeline)
    }
--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -1,76 +0,0 @@
-#! /usr/bin/env python3
-# Script to generate ext_index.json metadata file
-# that stores content of the control files and location of extension archives
-# for all extensions in extensions subdir.
-import argparse
-import json
-import subprocess
-from pathlib import Path
-
-"""
-# ext_index.json example:
-{
-    "public_extensions": [
-        "anon"
-    ],
-    "library_index": {
-        "anon": "anon",
-        // for more complex extensions like postgis
-        // we might have something like:
-        // address_standardizer: postgis
-        // postgis_tiger: postgis
-    },
-    "extension_data": {
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5648391853/v15/extensions/anon.tar.zst"
-        }
-    }
-}
-"""
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="generate ext_index.json")
-    parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
-    parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
-    parser.add_argument("--public_extensions", type=str, help="list of public extensions")
-    args = parser.parse_args()
-    pg_version = args.pg_version
-    BUILD_TAG = args.BUILD_TAG
-    public_ext_list = args.public_extensions.split(",")
-
-    ext_index = {}
-    library_index = {}
-    EXT_PATH = Path("extensions")
-    for extension in EXT_PATH.iterdir():
-        if extension.is_dir():
-            control_data = {}
-            for control_file in extension.glob("*.control"):
-                if control_file.suffix != ".control":
-                    continue
-                with open(control_file, "r") as f:
-                    control_data[control_file.name] = f.read()
-            ext_index[extension.name] = {
-                "control_data": control_data,
-                "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
-            }
-        elif extension.suffix == ".zst":
-            file_list = (
-                str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
-                .strip()
-                .split("\n")
-            )
-            for file in file_list:
-                if file.endswith(".so") and file.startswith("lib/"):
-                    lib_name = file[4:-3]
-                    library_index[lib_name] = extension.name.replace(".tar.zst", "")
-
-    all_data = {
-        "public_extensions": public_ext_list,
-        "library_index": library_index,
-        "extension_data": ext_index,
-    }
-    with open("ext_index.json", "w") as f:
-        json.dump(all_data, f)
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -12,25 +12,26 @@ import psycopg2.extras
 # We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
 FLAKY_TESTS_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, test
+        DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
    FROM
        (
            SELECT
-                revision,
-                jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
-                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
-                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
+                reference,
+                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
            FROM
                regress_test_results
-            WHERE
-                reference = 'refs/heads/main'
        ) data
    WHERE
        timestamp > CURRENT_DATE - INTERVAL '%s' day
-        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
+        AND (
+            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
+            OR retries_status_change::boolean
+        )
    ;
 """

@@ -40,6 +41,9 @@ def main(args: argparse.Namespace):
    interval_days = args.days
    output = args.output

+    build_type = args.build_type
+    pg_version = args.pg_version
+
    res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
    res = defaultdict(lambda: defaultdict(dict))

@@ -55,8 +59,21 @@ def main(args: argparse.Namespace):
        rows = []

    for row in rows:
-        logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
-        res[row["parent_suite"]][row["suite"]][row["test"]] = True
+        # We don't want to automatically rerun tests in a performance suite
+        if row["parent_suite"] != "test_runner.regress":
+            continue
+
+        deparametrized_test = row["deparametrized_test"]
+        dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
+        parametrized_test = deparametrized_test.replace(
+            "[",
+            f"[{build_type}-pg{pg_version}{dash_if_needed}",
+        )
+        res[row["parent_suite"]][row["suite"]][parametrized_test] = True
+
+        logging.info(
+            f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}"
+        )

    logging.info(f"saving results to {output.name}")
    json.dump(res, output, indent=2)
@@ -77,6 +94,18 @@ if __name__ == "__main__":
        type=int,
        help="how many days to look back for flaky tests (default: 10)",
    )
+    parser.add_argument(
+        "--build-type",
+        required=True,
+        type=str,
+        help="for which build type to create list of flaky tests (debug or release)",
+    )
+    parser.add_argument(
+        "--pg-version",
+        required=True,
+        type=int,
+        help="for which Postgres version to create list of flaky tests (14, 15, etc.)",
+    )
    parser.add_argument(
        "connstr",
        help="connection string to the test results database",
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -125,6 +125,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                    tenant_id: vec![0xFF; 16],
                    timeline_id: tli_from_u64(counter % n_keys),
                }),
+                term: 0,
                last_log_term: 0,
                flush_lsn: counter,
                commit_lsn: 2,
@@ -132,6 +133,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                remote_consistent_lsn: 4,
                peer_horizon_lsn: 5,
                safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
+                http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
                local_start_lsn: 0,
                availability_zone: None,
            };
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -22,6 +22,8 @@ message SubscribeSafekeeperInfoRequest {
 message SafekeeperTimelineInfo {
    uint64 safekeeper_id = 1;
    TenantTimelineId tenant_timeline_id = 2;
+    // Safekeeper term
+    uint64 term = 12;
    // Term of the last entry.
    uint64 last_log_term = 3;
    // LSN of the last record.
@@ -36,6 +38,8 @@ message SafekeeperTimelineInfo {
    uint64 local_start_lsn = 9;
    // A connection string to use for WAL receiving.
    string safekeeper_connstr = 10;
+    // HTTP endpoint connection string
+    string http_connstr = 13;
    // Availability zone of a safekeeper.
    optional string availability_zone = 11;
 }
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -519,6 +519,7 @@ mod tests {
                tenant_id: vec![0x00; 16],
                timeline_id,
            }),
+            term: 0,
            last_log_term: 0,
            flush_lsn: 1,
            commit_lsn: 2,
@@ -526,6 +527,7 @@ mod tests {
            remote_consistent_lsn: 4,
            peer_horizon_lsn: 5,
            safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
+            http_connstr: "neon-1-sk-1.local:7677".to_owned(),
            local_start_lsn: 0,
            availability_zone: None,
        }
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -233,10 +233,19 @@ if TYPE_CHECKING:

 def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
    response = list_prefix(neon_env_builder, prefix)
-    objects = response.get("Contents")
-    assert (
-        response["KeyCount"] == 0
-    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    keys = response["KeyCount"]
+    objects = response.get("Contents", [])
+
+    if keys != 0 and len(objects) == 0:
+        # this has been seen in one case with mock_s3:
+        # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
+        # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
+        common_prefixes = response.get("CommonPrefixes", [])
+        log.warning(
+            f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
+        )
+
+    assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"


 def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -369,7 +369,7 @@ def test_download_remote_layers_api(
    filled_current_physical = get_api_current_physical_size()
    log.info(filled_current_physical)
    filled_size = get_resident_physical_size()
-    log.info(filled_size)
+    log.info(f"filled_size: {filled_size}")
    assert filled_current_physical == filled_size, "we don't yet do layer eviction"

    env.pageserver.stop()
@@ -377,7 +377,7 @@ def test_download_remote_layers_api(
    # remove all the layer files
    # XXX only delete some of the layer files, to show that it really just downloads all the layers
    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
-        log.info(f"unlinking layer {layer}")
+        log.info(f"unlinking layer {layer.name}")
        layer.unlink()

    # Shut down safekeepers before starting the pageserver.
@@ -403,7 +403,7 @@ def test_download_remote_layers_api(
        filled_current_physical == get_api_current_physical_size()
    ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
    post_unlink_size = get_resident_physical_size()
-    log.info(post_unlink_size)
+    log.info(f"post_unlink_size: {post_unlink_size}")
    assert (
        post_unlink_size < filled_size
    ), "we just deleted layers and didn't cause anything to re-download them yet"
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -604,6 +604,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
    checkpoint_allowed_to_fail.set()
    env.pageserver.allowed_errors.append(
        ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
    )
+    # Separate entry: adjacent string literals would be concatenated into a single regex.
+    env.pageserver.allowed_errors.append(".* ERROR .*[Cc]ould not flush frozen layer.*")

    # Generous timeout, because currently deletions can get blocked waiting for compaction
--- a/vm-cgconfig.conf
+++ b/vm-cgconfig.conf
@@ -1,12 +0,0 @@
-# Configuration for cgroups in VM compute nodes
-group neon-postgres {
-    perm {
-        admin {
-            uid = vm-informant;
-        }
-        task {
-            gid = users;
-        }
-    }
-    memory {}
-}
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -14,11 +14,14 @@ publish = false
 ### BEGIN HAKARI SECTION
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
+axum = { version = "0.6", features = ["ws"] }
+base64 = { version = "0.21", features = ["alloc"] }
 bytes = { version = "1", features = ["serde"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+digest = { version = "0.10", features = ["mac", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures = { version = "0.3" }
@@ -27,6 +30,7 @@ futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
+hyper = { version = "0.14", features = ["full"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -41,20 +45,20 @@ regex = { version = "1" }
 regex-syntax = { version = "0.7" }
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
 ring = { version = "0.16", features = ["std"] }
-rustls = { version = "0.20", features = ["dangerous_configuration"] }
+rustls = { version = "0.21", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
+smallvec = { version = "1", default-features = false, features = ["write"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
-tokio-rustls = { version = "0.23" }
+tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "io"] }
 toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
-tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
 url = { version = "2", features = ["serde"] }

 [build-dependencies]
@@ -71,7 +75,7 @@ prost = { version = "0.11" }
 regex = { version = "1" }
 regex-syntax = { version = "0.7" }
 serde = { version = "1", features = ["alloc", "derive"] }
-syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] }
-syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] }
+syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
+syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }

 ### END HAKARI SECTION