backup download_all_layers.py script

backing up the exact tokio-epoll-uring version used in the earlier (since reverted) integration commit
commit dde7c280e77dbb867d2fd459d629da2fd7b0edc6 (HEAD -> problame/wip-2023-10-17, origin/problame/wip-2023-10-17)
Author: Christian Schwarz <me@cschwarz.com>
Date:   Tue Oct 17 10:09:48 2023 +0000

    no info! logging (not sure this matters, tracing showed up in perf when integrating this branch into neon.git)

The integration commit in this branch was:

commit 61fac1ab0b
Author: Christian Schwarz <me@cschwarz.com>
Date:   Tue Aug 29 19:13:38 2023 +0000

    CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking
2026-05-30 11:30:37 +00:00 · 2023-10-26 08:14:08 +00:00 · 2023-10-17 10:12:22 +00:00 · 2023-10-10 18:16:58 +00:00 · 2023-10-10 17:55:54 +00:00 · 2023-10-10 17:52:32 +00:00
194 changed files with 9242 additions and 19344 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -5,6 +5,4 @@ self-hosted-runner:
    - small
    - us-east-2
 config-variables:
-  - REMOTE_STORAGE_AZURE_CONTAINER
-  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -203,10 +203,6 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
-        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
-          exit 0
-        fi
-
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

        ./scripts/pysync
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -320,9 +320,6 @@ jobs:
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

-      - name: Build walproposer-lib
-        run: mold -run make walproposer-lib -j$(nproc)
-
      - name: Run cargo build
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
@@ -338,16 +335,6 @@ jobs:
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
-
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -433,7 +420,7 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
@@ -468,7 +455,7 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -847,7 +834,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.2
+      VM_BUILDER_VERSION: v0.17.12

    steps:
      - name: Checkout
@@ -1105,10 +1092,8 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-
-            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -32,7 +32,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 1
@@ -90,21 +90,18 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: make postgres-v14 -j$(sysctl -n hw.ncpu)
+        run: make postgres-v14 -j$(nproc)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: make postgres-v15 -j$(sysctl -n hw.ncpu)
+        run: make postgres-v15 -j$(nproc)

      - name: Build postgres v16
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: make postgres-v16 -j$(sysctl -n hw.ncpu)
+        run: make postgres-v16 -j$(nproc)

      - name: Build neon extensions
-        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
-
-      - name: Build walproposer-lib
-        run: make walproposer-lib -j$(sysctl -n hw.ncpu)
+        run: make neon-pg-ext -j$(nproc)

      - name: Run cargo build
        run: cargo build --all --release
@@ -129,7 +126,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 1
@@ -138,9 +135,6 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

-      - name: Build walproposer-lib
-        run: make walproposer-lib -j$(nproc)
-
      - name: Produce the build stats
        run: cargo build --all --release --timings

--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,7 +26,7 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
-    "libs/walproposer",
+    "libs/nostarve_queue",
 ]

 [workspace.package]
@@ -37,10 +37,6 @@ license = "Apache-2.0"
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
-azure_core = "0.16"
-azure_identity = "0.16"
-azure_storage = "0.16"
-azure_storage_blobs = "0.16"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -81,7 +77,6 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
@@ -161,11 +156,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -186,7 +181,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
-walproposer = { version = "0.1", path = "./libs/walproposer/" }
+nostarve_queue = { path = "./libs/nostarve_queue" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
@@ -202,7 +197,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ################# Binary contents sections

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -224,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
+    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -368,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
-    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
+    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
--- a/38
+++ b/38
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers walproposer-lib
+neon: postgres-headers
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

@@ -168,42 +168,6 @@ neon-pg-ext-clean-%:
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean

-# Build walproposer as a static library. walproposer source code is located
-# in the pgxn/neon directory.
-# 
-# We also need to include libpgport.a and libpgcommon.a, because walproposer
-# uses some functions from those libraries.
-# 
-# Some object files are removed from libpgport.a and libpgcommon.a because
-# they depend on openssl and other libraries that are not included in our
-# Rust build.
-.PHONY: walproposer-lib
-walproposer-lib: neon-pg-ext-v16
-	+@echo "Compiling walproposer-lib"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-ifeq ($(UNAME_S),Linux)
-	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
-		pg_strong_random.o
-	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
-		pg_crc32c.o \
-		hmac_openssl.o \
-		cryptohash_openssl.o \
-		scram-common.o \
-		md5_common.o \
-		checksum_helper.o
-endif
-
-.PHONY: walproposer-lib-clean
-walproposer-lib-clean:
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
-		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
-
 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
--- a/4
+++ b/4
@@ -1,5 +1,5 @@
 Neon
 Copyright 2022 Neon Inc.

-The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
-See vendor/postgres-vX/COPYRIGHT for details.
+The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
+PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -692,11 +692,10 @@ impl ComputeNode {
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        create_neon_superuser(spec, &mut client)?;
-        cleanup_instance(&mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, self.connstr.as_str())?;
+        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        create_availability_check_data(&mut client)?;

@@ -732,11 +731,10 @@ impl ComputeNode {
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
            client.simple_query("SET neon.forward_ddl = false")?;
-            cleanup_instance(&mut client)?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, &mut client, self.connstr.as_str())?;
+            handle_grants(&spec, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
        }

--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -1,4 +1,3 @@
-use std::collections::HashMap;
 use std::fmt::Write;
 use std::fs;
 use std::fs::File;
@@ -206,37 +205,22 @@ pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
 }

 /// Build a list of existing Postgres databases
-pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
-    // `pg_database.datconnlimit = -2` means that the database is in the
-    // invalid state. See:
-    //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-    let postgres_dbs: Vec<Database> = client
+pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
+    let postgres_dbs = client
        .query(
-            "SELECT
-                datname AS name,
-                datdba::regrole::text AS owner,
-                NOT datallowconn AS restrict_conn,
-                datconnlimit = - 2 AS invalid
-            FROM
-                pg_catalog.pg_database;",
+            "SELECT datname, datdba::regrole::text as owner
+               FROM pg_catalog.pg_database;",
            &[],
        )?
        .iter()
        .map(|row| Database {
-            name: row.get("name"),
+            name: row.get("datname"),
            owner: row.get("owner"),
-            restrict_conn: row.get("restrict_conn"),
-            invalid: row.get("invalid"),
            options: None,
        })
        .collect();

-    let dbs_map = postgres_dbs
-        .iter()
-        .map(|db| (db.name.clone(), db.clone()))
-        .collect::<HashMap<_, _>>();
-
-    Ok(dbs_map)
+    Ok(postgres_dbs)
 }

 /// Wait for Postgres to become ready to accept connections. It's ready to
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -13,7 +13,7 @@ use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

 use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
-use compute_api::spec::{ComputeSpec, PgIdent, Role};
+use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};

 // Do control plane request and return response if any. In case of error it
 // returns a bool flag indicating whether it makes sense to retry the request
@@ -161,38 +161,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

-/// Compute could be unexpectedly shut down, for example, during the
-/// database dropping. This leaves the database in the invalid state,
-/// which prevents new db creation with the same name. This function
-/// will clean it up before proceeding with catalog updates. All
-/// possible future cleanup operations may go here too.
-#[instrument(skip_all)]
-pub fn cleanup_instance(client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
-
-    for (_, db) in existing_dbs {
-        if db.invalid {
-            // After recent commit in Postgres, interrupted DROP DATABASE
-            // leaves the database in the invalid state. According to the
-            // commit message, the only option for user is to drop it again.
-            // See:
-            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-            //
-            // Postgres Neon extension is done the way, that db is de-registered
-            // in the control plane metadata only after it is dropped. So there is
-            // a chance that it still thinks that db should exist. This means
-            // that it will be re-created by `handle_databases()`. Yet, it's fine
-            // as user can just repeat drop (in vanilla Postgres they would need
-            // to do the same, btw).
-            let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
-            info!("dropping invalid database {}", db.name);
-            client.execute(query.as_str(), &[])?;
-        }
-    }
-
-    Ok(())
-}
-
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
@@ -302,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
@@ -411,13 +379,13 @@ fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent
 /// which together provide us idempotency.
 #[instrument(skip_all)]
 pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;

    // Print a list of existing Postgres databases (only in debug mode)
    if span_enabled!(Level::INFO) {
        info!("postgres databases:");
-        for (dbname, db) in &existing_dbs {
-            info!("    {}:{}", dbname, db.owner);
+        for r in &existing_dbs {
+            info!("    {}:{}", r.name, r.owner);
        }
    }

@@ -471,7 +439,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();

-                    if existing_dbs.get(&op.name).is_some() {
+                    // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+                    if existing_dbs.iter().any(|r| r.name == op.name) {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
@@ -488,12 +457,14 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    }

    // Refresh Postgres databases info to handle possible renames
-    let existing_dbs = get_existing_dbs(client)?;
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;

    info!("cluster spec databases:");
    for db in &spec.cluster.databases {
        let name = &db.name;
-        let pg_db = existing_dbs.get(name);
+
+        // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+        let pg_db = existing_dbs.iter().find(|r| r.name == *name);

        enum DatabaseAction {
            None,
@@ -559,32 +530,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
-    info!("modifying database permissions");
-    let existing_dbs = get_existing_dbs(client)?;
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
+    info!("cluster spec grants:");

    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
    for db in &spec.cluster.databases {
-        match existing_dbs.get(&db.name) {
-            Some(pg_db) => {
-                if pg_db.restrict_conn || pg_db.invalid {
-                    info!(
-                        "skipping grants for db {} (invalid: {}, connections not allowed: {})",
-                        db.name, pg_db.invalid, pg_db.restrict_conn
-                    );
-                    continue;
-                }
-            }
-            None => {
-                bail!(
-                    "database {} doesn't exist in Postgres after handle_databases()",
-                    db.name
-                );
-            }
-        }
-
        let mut conf = Config::from_str(connstr)?;
        conf.dbname(&db.name);

@@ -623,11 +575,6 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->

        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
        // This is needed because since postgres 15 this privilege is removed by default.
-        // TODO: web_access isn't created for almost 1 year. It could be that we have
-        // active users of 1 year old projects, but hopefully not, so check it and
-        // remove this code if possible. The worst thing that could happen is that
-        // user won't be able to use public schema in NEW databases created in the
-        // very OLD project.
        let grant_query = "DO $$\n\
                BEGIN\n\
                    IF EXISTS(\n\
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -28,7 +28,7 @@ mod pg_helpers_tests {
        assert_eq!(
            spec.cluster.settings.as_pg_settings(),
            r#"fsync = off
-wal_level = logical
+wal_level = replica
 hot_standby = on
 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
 wal_log_hints = on
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead};
 // it's waiting. If the process hasn't started/stopped after 5 seconds,
 // it prints a notice that it's taking long, but keeps waiting.
 //
-const RETRY_UNTIL_SECS: u64 = 10;
+const RETRY_UNTIL_SECS: u64 = 40;
 const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
 const RETRY_INTERVAL_MILLIS: u64 = 100;
 const DOT_EVERY_RETRIES: u64 = 10;
@@ -86,7 +86,7 @@ where
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
        .args(args);
-    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
+    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

    let pid_file_to_check = match initial_pid_file {
@@ -238,13 +238,11 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    filled_cmd
 }

-fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
-        "AZURE_STORAGE_ACCOUNT",
-        "AZURE_STORAGE_ACCESS_KEY",
    ] {
        if let Ok(value) = std::env::var(env_key) {
            cmd = cmd.env(env_key, value);
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -13,7 +13,6 @@ use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
 use utils::logging::{self, LogFormat};
-use utils::signals::{ShutdownSignals, Signal};

 use utils::{
    http::{
@@ -221,21 +220,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
            generation: 0,
        });

-    if let Some(attaching_pageserver) = attach_req.pageserver_id.as_ref() {
+    if attach_req.pageserver_id.is_some() {
        tenant_state.generation += 1;
-        tracing::info!(
-            "attach_hook: issuing generation {} to pageserver {}",
-            attaching_pageserver,
-            tenant_state.generation
-        );
-    } else if let Some(ps_id) = tenant_state.pageserver {
-        tracing::info!(
-            "attach_hook: dropping pageserver {} in generation {}",
-            ps_id,
-            tenant_state.generation
-        );
-    } else {
-        tracing::info!("attach_hook: no-op: tenant already has no pageserver");
    }
    tenant_state.pageserver = attach_req.pageserver_id;
    let generation = tenant_state.generation;
@@ -282,16 +268,7 @@ async fn main() -> anyhow::Result<()> {
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);
-
-    tokio::task::spawn(server);
-
-    ShutdownSignals::handle(|signal| match signal {
-        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
-            tracing::info!("Got {}. Terminating", signal.name());
-            // We're just a test helper: no graceful shutdown.
-            std::process::exit(0);
-        }
-    })?;
+    server.await?;

    Ok(())
 }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -116,7 +116,6 @@ fn main() -> Result<()> {
            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
-            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };
@@ -817,38 +816,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    Ok(())
 }

-fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match sub_match.subcommand() {
-        Some(ep_subcommand_data) => ep_subcommand_data,
-        None => bail!("no mappings subcommand provided"),
-    };
-
-    match sub_name {
-        "map" => {
-            let branch_name = sub_args
-                .get_one::<String>("branch-name")
-                .expect("branch-name argument missing");
-
-            let tenant_id = sub_args
-                .get_one::<String>("tenant-id")
-                .map(|x| TenantId::from_str(x))
-                .expect("tenant-id argument missing")
-                .expect("malformed tenant-id arg");
-
-            let timeline_id = sub_args
-                .get_one::<String>("timeline-id")
-                .map(|x| TimelineId::from_str(x))
-                .expect("timeline-id argument missing")
-                .expect("malformed timeline-id arg");
-
-            env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
-
-            Ok(())
-        }
-        other => unimplemented!("mappings subcommand {other}"),
-    }
-}
-
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
@@ -1117,7 +1084,6 @@ fn cli() -> Command {
    // --id, when using a pageserver command
    let pageserver_id_arg = Arg::new("pageserver-id")
        .long("id")
-        .global(true)
        .help("pageserver id")
        .required(false);
    // --pageserver-id when using a non-pageserver command
@@ -1288,20 +1254,17 @@ fn cli() -> Command {
            Command::new("pageserver")
                .arg_required_else_help(true)
                .about("Manage pageserver")
-                .arg(pageserver_id_arg)
                .subcommand(Command::new("status"))
-                .subcommand(Command::new("start")
-                    .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone())
-                )
-                .subcommand(Command::new("stop")
-                    .about("Stop local pageserver")
-                    .arg(stop_mode_arg.clone())
-                )
-                .subcommand(Command::new("restart")
-                    .about("Restart local pageserver")
-                    .arg(pageserver_config_args.clone())
-                )
+                .arg(pageserver_id_arg.clone())
+                .subcommand(Command::new("start").about("Start local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("stop").about("Stop local pageserver")
+                .arg(pageserver_id_arg.clone())
+                            .arg(stop_mode_arg.clone()))
+                .subcommand(Command::new("restart").about("Restart local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
        )
        .subcommand(
            Command::new("attachment_service")
@@ -1358,8 +1321,8 @@ fn cli() -> Command {
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg.clone())
-                    .arg(timeline_id_arg.clone())
+                    .arg(branch_name_arg)
+                    .arg(timeline_id_arg)
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
@@ -1372,7 +1335,7 @@ fn cli() -> Command {
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg.clone())
+                    .arg(tenant_id_arg)
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
@@ -1383,18 +1346,6 @@ fn cli() -> Command {
                )

        )
-        .subcommand(
-            Command::new("mappings")
-                .arg_required_else_help(true)
-                .about("Manage neon_local branch name mappings")
-                .subcommand(
-                    Command::new("map")
-                        .about("Create new mapping which cannot exist already")
-                        .arg(branch_name_arg.clone())
-                        .arg(tenant_id_arg.clone())
-                        .arg(timeline_id_arg.clone())
-                )
-        )
        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
        .subcommand(
            Command::new("pg")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -253,7 +253,7 @@ impl Endpoint {
        conf.append("shared_buffers", "1MB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
-        conf.append("wal_level", "logical");
+        conf.append("wal_level", "replica");
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreciever will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -18,7 +18,7 @@ use camino::Utf8PathBuf;
 use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
-use reqwest::blocking::{Client, RequestBuilder, Response};
+use reqwest::blocking::{Client, ClientBuilder, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::auth::{Claims, Scope};
@@ -93,7 +93,7 @@ impl PageServerNode {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
            conf: conf.clone(),
            env: env.clone(),
-            http_client: Client::new(),
+            http_client: ClientBuilder::new().timeout(None).build().unwrap(),
            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
        }
    }
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -25,7 +25,7 @@
            },
            {
                "name": "wal_level",
-                "value": "logical",
+                "value": "replica",
                "vartype": "enum"
            },
            {
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -188,60 +188,11 @@ that.

 ## Error message style

-### PostgreSQL extensions
-
 PostgreSQL has a style guide for writing error messages:

 https://www.postgresql.org/docs/current/error-style-guide.html

 Follow that guide when writing error messages in the PostgreSQL
-extensions.
-
-### Neon Rust code
-
-#### Anyhow Context
-
-When adding anyhow `context()`, use form `present-tense-verb+action`.
-
-Example:
- Bad: `file.metadata().context("could not get file metadata")?;`
- Good: `file.metadata().context("get file metadata")?;`
-
-#### Logging Errors
-
-When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`.
-
-If `e` is an `anyhow` error and you want to log the backtrace that it contains,
-use `{e:?}` instead of `{e:#}`.
-
-#### Rationale
-
-The `{:#}` ("alternate Display") of an `anyhow` error chain is concatenation fo the contexts, using `: `.
-
-For example, the following Rust code will result in output
-```
-ERROR  failed to list users: load users from server: parse response: invalid json
-```
-
-This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.:
-
-```
-ERROR  could not list users: could not load users from server: could not parse response: invalid json
-```
-
-
-```rust
-fn main() {
-  match list_users().context("list users") else {
-    Ok(_) => ...,
-    Err(e) => tracing::error!("failed to {e:#}"),
-  }
-}
-fn list_users() {
-  http_get_users().context("load users from server")?;
-}
-fn http_get_users() {
-  let response = client....?;
-  response.parse().context("parse response")?; // fails with serde error "invalid json"
-}
-```
+extension. We don't follow it strictly in the pageserver and
+safekeeper, but the advice in the PostgreSQL style guide is generally
+good, and you can't go wrong by following it.
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -96,16 +96,6 @@ prefix_in_bucket = '/test_prefix/'

 `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

-or
-
-```toml
-[remote_storage]
-container_name = 'some-container-name'
-container_region = 'us-east'
-prefix_in_container = '/test-prefix/'
-```
-
-`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed.

 ## Repository background tasks

--- a/download_all_layers.py
+++ b/download_all_layers.py
@@ -0,0 +1,29 @@
+"""Request every historic layer of every timeline from a local pageserver.
+
+Walks tenants -> timelines -> layers through the HTTP API on localhost:15003
+and issues one GET per selected layer, stopping at the first HTTP error.
+"""
+import requests
+
+BASE_URL = "http://localhost:15003/v1"
+
+
+def get_checked(url):
+    """GET `url`, raising `requests.HTTPError` on a 4xx/5xx response."""
+    response = requests.get(url)
+    response.raise_for_status()
+    return response
+
+
+for tenant in get_checked(f"{BASE_URL}/tenant").json():
+    tenant_url = f"{BASE_URL}/tenant/{tenant['id']}"
+    for timeline in get_checked(f"{tenant_url}/timeline").json():
+        tid = timeline["tenant_id"]
+        tlid = timeline["timeline_id"]
+        layers_url = f"{BASE_URL}/tenant/{tid}/timeline/{tlid}/layer"
+        for layer in get_checked(layers_url).json()["historic_layers"]:
+            # NOTE(review): only layers whose `remote` flag is false are
+            # requested; confirm this is the intended selection.
+            if not layer["remote"]:
+                # Check the status so a failed request is not silently ignored.
+                get_checked(f"{layers_url}/{layer['layer_file_name']}")
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -200,12 +200,6 @@ pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
    pub options: GenericOptions,
-    // These are derived flags, not present in the spec file.
-    // They are never set by the control plane.
-    #[serde(skip_deserializing, default)]
-    pub restrict_conn: bool,
-    #[serde(skip_deserializing, default)]
-    pub invalid: bool,
 }

 /// Common type representing both SQL statement params with or without value,
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -76,7 +76,7 @@
            },
            {
                "name": "wal_level",
-                "value": "logical",
+                "value": "replica",
                "vartype": "enum"
            },
            {
--- a/libs/metrics/src/wrappers.rs
+++ b/libs/metrics/src/wrappers.rs
@@ -1,6 +1,6 @@
 use std::io::{Read, Result, Write};

-/// A wrapper for an object implementing [Read]
+/// A wrapper for an object implementing [Read](std::io::Read)
 /// which allows a closure to observe the amount of bytes read.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -51,17 +51,17 @@ impl<'a, T> CountedReader<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Read] implementor
+    /// Get an immutable reference to the underlying [Read](std::io::Read) implementor
    pub fn inner(&self) -> &T {
        &self.reader
    }

-    /// Get a mutable reference to the underlying [Read] implementor
+    /// Get a mutable reference to the underlying [Read](std::io::Read) implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.reader
    }

-    /// Consume the wrapper and return the underlying [Read] implementor
+    /// Consume the wrapper and return the underlying [Read](std::io::Read) implementor
    pub fn into_inner(self) -> T {
        self.reader
    }
@@ -75,7 +75,7 @@ impl<T: Read> Read for CountedReader<'_, T> {
    }
 }

-/// A wrapper for an object implementing [Write]
+/// A wrapper for an object implementing [Write](std::io::Write)
 /// which allows a closure to observe the amount of bytes written.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -122,17 +122,17 @@ impl<'a, T> CountedWriter<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Write] implementor
+    /// Get an immutable reference to the underlying [Write](std::io::Write) implementor
    pub fn inner(&self) -> &T {
        &self.writer
    }

-    /// Get a mutable reference to the underlying [Write] implementor
+    /// Get a mutable reference to the underlying [Write](std::io::Write) implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.writer
    }

-    /// Consume the wrapper and return the underlying [Write] implementor
+    /// Consume the wrapper and return the underlying [Write](std::io::Write) implementor
    pub fn into_inner(self) -> T {
        self.writer
    }
--- a/libs/nostarve_queue/Cargo.toml
+++ b/libs/nostarve_queue/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "nostarve_queue"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+scopeguard.workspace = true
+tracing.workspace = true
+
+[dev-dependencies]
+futures.workspace = true
+rand.workspace = true
+tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] }
--- a/libs/nostarve_queue/src/lib.rs
+++ b/libs/nostarve_queue/src/lib.rs
@@ -0,0 +1,316 @@
+//! Synchronization primitive to prevent starvation among concurrent tasks that do the same work.
+
+use std::{
+    collections::VecDeque,
+    fmt,
+    future::poll_fn,
+    sync::Mutex,
+    task::{Poll, Waker},
+};
+
+pub struct Queue<T> {
+    inner: Mutex<Inner<T>>,
+}
+
+struct Inner<T> {
+    waiters: VecDeque<usize>, // slot indices still waiting for a datum; pushed at the back, popped from the front
+    free: VecDeque<usize>, // indices of `slots` entries that are currently `None`
+    slots: Vec<Option<(Option<Waker>, Option<T>)>>, // per claimed slot: (waker to notify, deposited datum)
+}
+
+#[derive(Clone, Copy)]
+pub struct Position<'q, T> {
+    idx: usize,
+    queue: &'q Queue<T>,
+}
+
+impl<T> fmt::Debug for Position<'_, T> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Position").field("idx", &self.idx).finish()
+    }
+}
+
+impl<T> Inner<T> {
+    /// Consistency check of `waiters`, `free` and `slots`.
+    ///
+    /// Compiled to a no-op outside of tests.
+    #[cfg(not(test))]
+    #[inline]
+    fn integrity_check(&self) {}
+
+    /// Consistency check of `waiters`, `free` and `slots`; panics on violation.
+    ///
+    /// Expected membership of each slot index:
+    /// - `None`: in `free`, not in `waiters`
+    /// - `Some((_, None))`: in `waiters`, not in `free`
+    /// - `Some((_, Some(_)))`: in neither
+    ///
+    /// The waker field of a slot does not influence the expected membership.
+    #[cfg(test)]
+    fn integrity_check(&self) {
+        use std::collections::HashSet;
+
+        let waiting: HashSet<usize> = self.waiters.iter().copied().collect();
+        let unused: HashSet<usize> = self.free.iter().copied().collect();
+
+        for (idx, slot) in self.slots.iter().enumerate() {
+            // Each arm yields (must be in `waiters`, must be in `free`) for
+            // the slot state it matches.
+            let (expect_waiting, expect_free) = match slot {
+                None => (false, true),
+                Some((_, None)) => (true, false),
+                Some((_, Some(_))) => (false, false),
+            };
+            assert!(waiting.contains(&idx) == expect_waiting);
+            assert!(unused.contains(&idx) == expect_free);
+        }
+    }
+}
+
+impl<T> Queue<T> {
+    /// Creates a queue with `size` wait slots, all of them initially free.
+    pub fn new(size: usize) -> Self {
+        Queue {
+            inner: Mutex::new(Inner {
+                waiters: VecDeque::new(),
+                free: (0..size).collect(),
+                slots: (0..size).map(|_| None).collect(),
+            }),
+        }
+    }
+
+    /// Claims a free slot and appends it to the back of the waiter line.
+    ///
+    /// Returns `Err(())` when every slot is already claimed. A `Position` that
+    /// is never passed to `complete_and_wait` keeps its slot claimed.
+    pub fn begin(&self) -> Result<Position<T>, ()> {
+        #[cfg(test)]
+        tracing::trace!("get in line locking inner");
+        let mut inner = self.inner.lock().unwrap();
+        inner.integrity_check();
+        // All slots claimed: report it through the `Result` instead of panicking.
+        let my_waitslot_idx = inner.free.pop_front().ok_or(())?;
+        inner.waiters.push_back(my_waitslot_idx);
+        let prev = inner.slots[my_waitslot_idx].replace((None, None));
+        assert!(prev.is_none());
+        inner.integrity_check();
+        Ok(Position {
+            idx: my_waitslot_idx,
+            queue: self,
+        })
+    }
+}
+
+impl<'q, T> Position<'q, T> {
+    pub fn complete_and_wait(self, datum: T) -> impl std::future::Future<Output = T> + 'q {
+        #[cfg(test)]
+        tracing::trace!("found victim locking waiters");
+        let mut inner = self.queue.inner.lock().unwrap();
+        inner.integrity_check();
+        let winner_idx = inner.waiters.pop_front().expect("we put ourselves in");
+        #[cfg(test)]
+        tracing::trace!(winner_idx, "putting victim into next waiters slot");
+        let winner_slot = inner.slots[winner_idx].as_mut().unwrap();
+        let prev = winner_slot.1.replace(datum);
+        assert!(
+            prev.is_none(),
+            "ensure we didn't mess up this simple ring buffer structure"
+        );
+        if let Some(waker) = winner_slot.0.take() {
+            #[cfg(test)]
+            tracing::trace!(winner_idx, "waking up winner");
+            waker.wake()
+        }
+        inner.integrity_check();
+        drop(inner); // the poll_fn locks it again
+
+        let mut poll_num = 0;
+        let mut drop_guard = Some(scopeguard::guard((), |()| {
+            panic!("must not drop this future until Ready");
+        }));
+
+        // take the victim that was found by someone else
+        poll_fn(move |cx| {
+            let my_waitslot_idx = self.idx;
+            poll_num += 1;
+            #[cfg(test)]
+            tracing::trace!(poll_num, "poll_fn locking waiters");
+            let mut inner = self.queue.inner.lock().unwrap();
+            inner.integrity_check();
+            let my_waitslot = inner.slots[self.idx].as_mut().unwrap();
+            // assert!(
+            //     poll_num <= 2,
+            //     "once we place the waker in the slot, next wakeup should have a result: {}",
+            //     my_waitslot.1.is_some()
+            // );
+            if let Some(res) = my_waitslot.1.take() {
+                #[cfg(test)]
+                tracing::trace!(poll_num, "have cache slot");
+                // above .take() resets the waiters slot to None
+                debug_assert!(my_waitslot.0.is_none());
+                debug_assert!(my_waitslot.1.is_none());
+                inner.slots[my_waitslot_idx] = None;
+                inner.free.push_back(my_waitslot_idx);
+                let _ = scopeguard::ScopeGuard::into_inner(drop_guard.take().unwrap());
+                inner.integrity_check();
+                return Poll::Ready(res);
+            }
+            // assert_eq!(poll_num, 1);
+            if !my_waitslot
+                .0
+                .as_ref()
+                .map(|existing| cx.waker().will_wake(existing))
+                .unwrap_or(false)
+            {
+                let prev = my_waitslot.0.replace(cx.waker().clone());
+                #[cfg(test)]
+                tracing::trace!(poll_num, prev_is_some = prev.is_some(), "updating waker");
+            }
+            inner.integrity_check();
+            #[cfg(test)]
+            tracing::trace!(poll_num, "waiting to be woken up");
+            Poll::Pending
+        })
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::{
+        sync::{
+            atomic::{AtomicBool, Ordering},
+            Arc,
+        },
+        task::Poll,
+        time::Duration,
+    };
+
+    use rand::RngCore;
+
+    #[tokio::test]
+    async fn in_order_completion_and_wait() {
+        let queue = super::Queue::new(2);
+
+        let q1 = queue.begin().unwrap();
+        let q2 = queue.begin().unwrap();
+
+        assert_eq!(q1.complete_and_wait(23).await, 23);
+        assert_eq!(q2.complete_and_wait(42).await, 42);
+    }
+
+    #[tokio::test]
+    async fn out_of_order_completion_and_wait() {
+        let queue = super::Queue::new(2);
+
+        let q1 = queue.begin().unwrap();
+        let q2 = queue.begin().unwrap();
+
+        let mut q2compfut = q2.complete_and_wait(23);
+
+        match futures::poll!(&mut q2compfut) {
+            Poll::Pending => {}
+            Poll::Ready(_) => panic!("should not be ready yet, it's queued after q1"),
+        }
+
+        let q1res = q1.complete_and_wait(42).await;
+        assert_eq!(q1res, 23);
+
+        let q2res = q2compfut.await;
+        assert_eq!(q2res, 42);
+    }
+
+    #[tokio::test]
+    async fn in_order_completion_out_of_order_wait() {
+        let queue = super::Queue::new(2);
+
+        let q1 = queue.begin().unwrap();
+        let q2 = queue.begin().unwrap();
+
+        let mut q1compfut = q1.complete_and_wait(23);
+
+        let mut q2compfut = q2.complete_and_wait(42);
+
+        match futures::poll!(&mut q2compfut) {
+            Poll::Pending => {
+                unreachable!("q2 should be ready, it wasn't first but q1 is serviced already")
+            }
+            Poll::Ready(x) => assert_eq!(x, 42),
+        }
+
+        assert_eq!(futures::poll!(&mut q1compfut), Poll::Ready(23));
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn stress() {
+        let ntasks = 8;
+        let queue_size = 8;
+        let queue = Arc::new(super::Queue::new(queue_size));
+
+        let stop = Arc::new(AtomicBool::new(false));
+
+        let mut tasks = vec![];
+        for i in 0..ntasks {
+            let jh = tokio::spawn({
+                let queue = Arc::clone(&queue);
+                let stop = Arc::clone(&stop);
+                async move {
+                    while !stop.load(Ordering::Relaxed) {
+                        let q = queue.begin().unwrap();
+                        for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
+                            std::hint::spin_loop();
+                        }
+                        q.complete_and_wait(i).await;
+                        tokio::task::yield_now().await;
+                    }
+                }
+            });
+            tasks.push(jh);
+        }
+
+        tokio::time::sleep(Duration::from_secs(10)).await;
+
+        stop.store(true, Ordering::Relaxed);
+
+        for t in tasks {
+            t.await.unwrap();
+        }
+    }
+
+    #[test]
+    fn stress_two_runtimes_shared_queue() {
+        std::thread::scope(|s| {
+            let ntasks = 8;
+            let queue_size = 8;
+            let queue = Arc::new(super::Queue::new(queue_size));
+
+            let stop = Arc::new(AtomicBool::new(false));
+
+            for i in 0..ntasks {
+                s.spawn({
+                    let queue = Arc::clone(&queue);
+                    let stop = Arc::clone(&stop);
+                    move || {
+                        let rt = tokio::runtime::Builder::new_current_thread()
+                            .enable_all()
+                            .build()
+                            .unwrap();
+                        rt.block_on(async move {
+                            while !stop.load(Ordering::Relaxed) {
+                                let q = queue.begin().unwrap();
+                                for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
+                                    std::hint::spin_loop();
+                                }
+                                q.complete_and_wait(i).await;
+                                tokio::task::yield_now().await;
+                            }
+                        });
+                    }
+                });
+            }
+
+            std::thread::sleep(Duration::from_secs(10));
+
+            stop.store(true, Ordering::Relaxed);
+        });
+    }
+}
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custom Ord and PartialOrd implementations below with
-// deriving them. This will require changes in walredoproc.c.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -40,9 +40,21 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
    fn cmp(&self, other: &Self) -> Ordering {
-        // Custom ordering where we put forknum to the end of the list
-        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
-        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
+        let mut cmp = self.spcnode.cmp(&other.spcnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.dbnode.cmp(&other.dbnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.relnode.cmp(&other.relnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.forknum.cmp(&other.forknum);
+
+        cmp
    }
 }

--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -19,8 +19,8 @@ use tracing::{debug, error, info, trace};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
-    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN,
-    SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION,
+    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
+    SQLSTATE_SUCCESSFUL_COMPLETION,
 };

 /// An error, occurred during query processing:
@@ -30,9 +30,6 @@ pub enum QueryError {
    /// The connection was lost while processing the query.
    #[error(transparent)]
    Disconnected(#[from] ConnectionError),
-    /// We were instructed to shutdown while processing the query
-    #[error("Shutting down")]
-    Shutdown,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -47,8 +44,7 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006", // connection failure
-            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
+            Self::Disconnected(_) => b"08006",         // connection failure
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -400,20 +396,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        // socket might be already closed, e.g. if previously received error,
        // so ignore result.
        self.framed.shutdown().await.ok();
-        match ret {
-            Ok(()) => Ok(()),
-            Err(QueryError::Shutdown) => {
-                info!("Stopped due to shutdown");
-                Ok(())
-            }
-            Err(QueryError::Disconnected(e)) => {
-                info!("Disconnected ({e:#})");
-                // Disconnection is not an error: we just use it that way internally to drop
-                // out of loops.
-                Ok(())
-            }
-            e => e,
-        }
+        ret
    }

    async fn run_message_loop<F, S>(
@@ -433,11 +416,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
-                return Err(QueryError::Shutdown)
+                return Ok(())
            },

-            handshake_r = self.handshake(handler) => {
-                handshake_r?;
+            result = self.handshake(handler) => {
+                // Handshake complete.
+                result?;
+                if self.state == ProtoState::Closed {
+                    return Ok(()); // EOF during handshake
+                }
            }
        );

@@ -448,34 +435,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
-                return Err(QueryError::Shutdown)
+                Ok(None)
            },
            msg = self.read_message() => { msg },
        )? {
            trace!("got message {:?}", msg);

            let result = self.process_message(handler, msg, &mut query_string).await;
-            tokio::select!(
-                biased;
-                _ = shutdown_watcher() => {
-                    // We were requested to shut down.
-                    tracing::info!("shutdown request received during response flush");
-
-                    // If we exited process_message with a shutdown error, there may be
-                    // some valid response content on in our transmit buffer: permit sending
-                    // this within a short timeout.  This is a best effort thing so we don't
-                    // care about the result.
-                    tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok();
-
-                    return Err(QueryError::Shutdown)
-                },
-                flush_r = self.flush() => {
-                    flush_r?;
-                }
-            );
-
+            self.flush().await?;
            match result? {
                ProcessMsgResult::Continue => {
+                    self.flush().await?;
                    continue;
                }
                ProcessMsgResult::Break => break,
@@ -580,9 +550,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
-                        ProtocolError::Protocol("EOF during handshake".to_string()),
-                    )));
+                    return Ok(());
                }
            }
        }
@@ -621,9 +589,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
-                        ProtocolError::Protocol("EOF during auth".to_string()),
-                    )));
+                    return Ok(());
                }
            }
        }
@@ -947,7 +913,6 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Shutdown => "shutdown".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -964,9 +929,6 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
-        QueryError::Shutdown => {
-            info!("query handler for '{query}' cancelled during tenant shutdown")
-        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -131,7 +131,6 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
-pub use v14::xlog_utils::from_pg_timestamp;
 pub use v14::xlog_utils::get_current_timestamp;
 pub use v14::xlog_utils::to_pg_timestamp;
 pub use v14::xlog_utils::XLogFileName;
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -220,10 +220,6 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

-/* From replication/slot.h */
-pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
-   + 64 /* NameData */  + 4*4;
-
 /* From fsm_internals.h */
 const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
 const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -136,42 +136,21 @@ pub fn get_current_timestamp() -> TimestampTz {
    to_pg_timestamp(SystemTime::now())
 }

-// Module to reduce the scope of the constants
-mod timestamp_conversions {
-    use std::time::Duration;
-
-    use super::*;
-
-    const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
-    const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
+pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
+    const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
+    const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
    const SECS_PER_DAY: u64 = 86400;
    const USECS_PER_SEC: u64 = 1000000;
-    const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 =
-        (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY;
-
-    pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
-        match time.duration_since(SystemTime::UNIX_EPOCH) {
-            Ok(n) => {
-                ((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC
-                    + n.subsec_micros() as u64) as i64
-            }
-            Err(_) => panic!("SystemTime before UNIX EPOCH!"),
+    match time.duration_since(SystemTime::UNIX_EPOCH) {
+        Ok(n) => {
+            ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
+                * USECS_PER_SEC
+                + n.subsec_micros() as u64) as i64
        }
-    }
-
-    pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
-        let time: u64 = time
-            .try_into()
-            .expect("timestamp before millenium (postgres epoch)");
-        let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
-        SystemTime::UNIX_EPOCH
-            .checked_add(Duration::from_micros(since_unix_epoch))
-            .expect("SystemTime overflow")
+        Err(_) => panic!("SystemTime before UNIX EPOCH!"),
    }
 }

-pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
-
 // Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
 // start_lsn must point to some previously known record boundary (beginning of
 // the next record). If no valid record after is found, start_lsn is returned
@@ -502,24 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
    wal
 }

-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_ts_conversion() {
-        let now = SystemTime::now();
-        let round_trip = from_pg_timestamp(to_pg_timestamp(now));
-
-        let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
-        let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
-        assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
-
-        let now_pg = get_current_timestamp();
-        let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
-
-        assert_eq!(now_pg, round_trip_pg);
-    }
-
-    // If you need to craft WAL and write tests for this module, put it at wal_craft crate.
-}
+// If you need to craft WAL and write tests for this module, put them in the wal_craft crate.
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -670,7 +670,6 @@ pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
 }

 pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
-pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
 pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";

 impl<'a> BeMessage<'a> {
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,7 +13,6 @@ aws-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
-bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 serde.workspace = true
@@ -27,13 +26,6 @@ metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
 workspace_hack.workspace = true
-azure_core.workspace = true
-azure_identity.workspace = true
-azure_storage.workspace = true
-azure_storage_blobs.workspace = true
-futures-util.workspace = true
-http-types.workspace = true
-itertools.workspace = true

 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,356 +0,0 @@
-//! Azure Blob Storage wrapper
-
-use std::env;
-use std::num::NonZeroU32;
-use std::sync::Arc;
-use std::{borrow::Cow, collections::HashMap, io::Cursor};
-
-use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
-use anyhow::Result;
-use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::Header;
-use azure_identity::DefaultAzureCredential;
-use azure_storage::StorageCredentials;
-use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{
-    blob::operations::GetBlobBuilder,
-    prelude::{BlobClient, ContainerClient},
-};
-use futures_util::StreamExt;
-use http_types::StatusCode;
-use tokio::io::AsyncRead;
-use tracing::debug;
-
-use crate::s3_bucket::RequestKind;
-use crate::{
-    AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
-    StorageMetadata,
-};
-
-pub struct AzureBlobStorage {
-    client: ContainerClient,
-    prefix_in_container: Option<String>,
-    max_keys_per_list_response: Option<NonZeroU32>,
-    concurrency_limiter: ConcurrencyLimiter,
-}
-
-impl AzureBlobStorage {
-    pub fn new(azure_config: &AzureConfig) -> Result<Self> {
-        debug!(
-            "Creating azure remote storage for azure container {}",
-            azure_config.container_name
-        );
-
-        let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
-
-        // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
-        // otherwise try the token based credentials.
-        let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
-            StorageCredentials::access_key(account.clone(), access_key)
-        } else {
-            let token_credential = DefaultAzureCredential::default();
-            StorageCredentials::token_credential(Arc::new(token_credential))
-        };
-
-        let builder = ClientBuilder::new(account, credentials);
-
-        let client = builder.container_client(azure_config.container_name.to_owned());
-
-        let max_keys_per_list_response =
-            if let Some(limit) = azure_config.max_keys_per_list_response {
-                Some(
-                    NonZeroU32::new(limit as u32)
-                        .ok_or_else(|| anyhow::anyhow!("max_keys_per_list_response can't be 0"))?,
-                )
-            } else {
-                None
-            };
-
-        Ok(AzureBlobStorage {
-            client,
-            prefix_in_container: azure_config.prefix_in_container.to_owned(),
-            max_keys_per_list_response,
-            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
-        })
-    }
-
-    pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
-        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
-        match &self.prefix_in_container {
-            Some(prefix) => {
-                if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                    prefix.clone() + path_string
-                } else {
-                    format!("{prefix}{REMOTE_STORAGE_PREFIX_SEPARATOR}{path_string}")
-                }
-            }
-            None => path_string.to_string(),
-        }
-    }
-
-    fn name_to_relative_path(&self, key: &str) -> RemotePath {
-        let relative_path =
-            match key.strip_prefix(self.prefix_in_container.as_deref().unwrap_or_default()) {
-                Some(stripped) => stripped,
-                // we rely on Azure to return properly prefixed paths
-                // for requests with a certain prefix
-                None => panic!(
-                    "Key {key} does not start with container prefix {:?}",
-                    self.prefix_in_container
-                ),
-            };
-        RemotePath(
-            relative_path
-                .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                .collect(),
-        )
-    }
-
-    async fn download_for_builder(
-        &self,
-        metadata: StorageMetadata,
-        builder: GetBlobBuilder,
-    ) -> Result<Download, DownloadError> {
-        let mut response = builder.into_stream();
-
-        // TODO give proper streaming response instead of buffering into RAM
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::new();
-        while let Some(part) = response.next().await {
-            let part = part.map_err(to_download_error)?;
-            let data = part
-                .data
-                .collect()
-                .await
-                .map_err(|e| DownloadError::Other(e.into()))?;
-            buf.extend_from_slice(&data.slice(..));
-        }
-        Ok(Download {
-            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(metadata),
-        })
-    }
-    // TODO get rid of this function once we have metadata included in the response
-    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
-    async fn get_metadata(
-        &self,
-        blob_client: &BlobClient,
-    ) -> Result<StorageMetadata, DownloadError> {
-        let builder = blob_client.get_metadata();
-
-        let response = builder.into_future().await.map_err(to_download_error)?;
-        let mut map = HashMap::new();
-
-        for md in response.metadata.iter() {
-            map.insert(
-                md.name().as_str().to_string(),
-                md.value().as_str().to_string(),
-            );
-        }
-        Ok(StorageMetadata(map))
-    }
-
-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
-        self.concurrency_limiter
-            .acquire(kind)
-            .await
-            .expect("semaphore is never closed")
-    }
-}
-
-fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
-    let mut res = Metadata::new();
-    for (k, v) in metadata.0.into_iter() {
-        res.insert(k, v);
-    }
-    res
-}
-
-fn to_download_error(error: azure_core::Error) -> DownloadError {
-    if let Some(http_err) = error.as_http_error() {
-        match http_err.status() {
-            StatusCode::NotFound => DownloadError::NotFound,
-            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
-            _ => DownloadError::Other(anyhow::Error::new(error)),
-        }
-    } else {
-        DownloadError::Other(error.into())
-    }
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for AzureBlobStorage {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
-            });
-
-        let mut builder = self
-            .client
-            .list_blobs()
-            .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-
-        if let Some(prefix) = list_prefix {
-            builder = builder.prefix(Cow::from(prefix.to_owned()));
-        }
-
-        if let Some(limit) = self.max_keys_per_list_response {
-            builder = builder.max_results(MaxResults::new(limit));
-        }
-
-        let mut response = builder.into_stream();
-        let mut res = Vec::new();
-        while let Some(entry) = response.next().await {
-            let entry = entry.map_err(to_download_error)?;
-            let name_iter = entry
-                .blobs
-                .prefixes()
-                .map(|prefix| self.name_to_relative_path(&prefix.name));
-            res.extend(name_iter);
-        }
-        Ok(res)
-    }
-
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let folder_name = folder
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone());
-
-        let mut builder = self.client.list_blobs();
-
-        if let Some(folder_name) = folder_name {
-            builder = builder.prefix(Cow::from(folder_name.to_owned()));
-        }
-
-        if let Some(limit) = self.max_keys_per_list_response {
-            builder = builder.max_results(MaxResults::new(limit));
-        }
-
-        let mut response = builder.into_stream();
-        let mut res = Vec::new();
-        while let Some(l) = response.next().await {
-            let entry = l.map_err(anyhow::Error::new)?;
-            let name_iter = entry
-                .blobs
-                .blobs()
-                .map(|bl| self.name_to_relative_path(&bl.name));
-            res.extend(name_iter);
-        }
-        Ok(res)
-    }
-
-    async fn upload(
-        &self,
-        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
-        data_size_bytes: usize,
-        to: &RemotePath,
-        metadata: Option<StorageMetadata>,
-    ) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Put).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
-
-        // TODO FIX THIS UGLY HACK and don't buffer the entire object
-        // into RAM here, but use the streaming interface. For that,
-        // we'd have to change the interface though...
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::with_capacity(data_size_bytes);
-        tokio::io::copy(&mut from, &mut buf).await?;
-        let body = azure_core::Body::Bytes(buf.into());
-
-        let mut builder = blob_client.put_block_blob(body);
-
-        if let Some(metadata) = metadata {
-            builder = builder.metadata(to_azure_metadata(metadata));
-        }
-
-        let _response = builder.into_future().await?;
-
-        Ok(())
-    }
-
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
-
-        let metadata = self.get_metadata(&blob_client).await?;
-
-        let builder = blob_client.get();
-
-        self.download_for_builder(metadata, builder).await
-    }
-
-    async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-    ) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
-
-        let metadata = self.get_metadata(&blob_client).await?;
-
-        let mut builder = blob_client.get();
-
-        if let Some(end_exclusive) = end_exclusive {
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
-        } else {
-            // Open ranges are not supported by the SDK so we work around
-            // by setting the upper limit extremely high (but high enough
-            // to still be representable by signed 64 bit integers).
-            // TODO remove workaround once the SDK adds open range support
-            // https://github.com/Azure/azure-sdk-for-rust/issues/1438
-            let end_exclusive = u64::MAX / 4;
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
-        }
-
-        self.download_for_builder(metadata, builder).await
-    }
-
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Delete).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
-
-        let builder = blob_client.delete();
-
-        match builder.into_future().await {
-            Ok(_response) => Ok(()),
-            Err(e) => {
-                if let Some(http_err) = e.as_http_error() {
-                    if http_err.status() == StatusCode::NotFound {
-                        return Ok(());
-                    }
-                }
-                Err(anyhow::Error::new(e))
-            }
-        }
-    }
-
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        // Permit is already obtained by inner delete function
-
-        // TODO batch requests are also not supported by the SDK
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1068
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1249
-        for path in paths {
-            self.delete(path).await?;
-        }
-        Ok(())
-    }
-}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -4,10 +4,7 @@
 //! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
 //!   * [`local_fs`] allows to use local file system as an external storage
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
-//!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
-
-mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
@@ -24,15 +21,11 @@ use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};

 use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::io;
 use toml_edit::Item;
 use tracing::info;

-pub use self::{
-    azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket,
-    simulate_failures::UnreliableWrapper,
-};
-use s3_bucket::RequestKind;
+pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};

 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -46,11 +39,6 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
-/// We set this a little bit low as we currently buffer the entire file into RAM
-///
-/// Here, a limit of max 20k concurrent connections was noted.
-/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
-pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -229,7 +217,6 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
-    AzureBlob(Arc<AzureBlobStorage>),
    Unreliable(Arc<UnreliableWrapper>),
 }

@@ -241,7 +228,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_files(folder).await,
            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::AzureBlob(s) => s.list_files(folder).await,
            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }
@@ -256,7 +242,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_prefixes(prefix).await,
            Self::AwsS3(s) => s.list_prefixes(prefix).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix).await,
            Self::Unreliable(s) => s.list_prefixes(prefix).await,
        }
    }
@@ -271,7 +256,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
        }
    }
@@ -280,7 +264,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.download(from).await,
            Self::AwsS3(s) => s.download(from).await,
-            Self::AzureBlob(s) => s.download(from).await,
            Self::Unreliable(s) => s.download(from).await,
        }
    }
@@ -300,10 +283,6 @@ impl GenericRemoteStorage {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
            }
-            Self::AzureBlob(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
-                    .await
-            }
            Self::Unreliable(s) => {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
@@ -315,7 +294,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete(path).await,
            Self::AwsS3(s) => s.delete(path).await,
-            Self::AzureBlob(s) => s.delete(path).await,
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
@@ -324,7 +302,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete_objects(paths).await,
            Self::AwsS3(s) => s.delete_objects(paths).await,
-            Self::AzureBlob(s) => s.delete_objects(paths).await,
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
@@ -342,11 +319,6 @@ impl GenericRemoteStorage {
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
            }
-            RemoteStorageKind::AzureContainer(azure_config) => {
-                info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
-                      azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
-                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
-            }
        })
    }

@@ -411,9 +383,6 @@ pub enum RemoteStorageKind {
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
-    /// Azure Blob based storage, storing all files in the container
-    /// specified by the config
-    AzureContainer(AzureConfig),
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
@@ -453,45 +422,11 @@ impl Debug for S3Config {
    }
 }

-/// Azure  bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
-pub struct AzureConfig {
-    /// Name of the container to connect to.
-    pub container_name: String,
-    /// The region where the bucket is located at.
-    pub container_region: String,
-    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
-    pub prefix_in_container: Option<String>,
-    /// Azure has various limits on its API calls, we need not to exceed those.
-    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
-    pub concurrency_limit: NonZeroUsize,
-    pub max_keys_per_list_response: Option<i32>,
-}
-
-impl Debug for AzureConfig {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("AzureConfig")
-            .field("bucket_name", &self.container_name)
-            .field("bucket_region", &self.container_region)
-            .field("prefix_in_bucket", &self.prefix_in_container)
-            .field("concurrency_limit", &self.concurrency_limit)
-            .field(
-                "max_keys_per_list_response",
-                &self.max_keys_per_list_response,
-            )
-            .finish()
-    }
-}
-
 impl RemoteStorageConfig {
    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let local_path = toml.get("local_path");
        let bucket_name = toml.get("bucket_name");
        let bucket_region = toml.get("bucket_region");
-        let container_name = toml.get("container_name");
-        let container_region = toml.get("container_region");
-
-        let use_azure = container_name.is_some() && container_region.is_some();

        let max_concurrent_syncs = NonZeroUsize::new(
            parse_optional_integer("max_concurrent_syncs", toml)?
@@ -505,13 +440,9 @@ impl RemoteStorageConfig {
        )
        .context("Failed to parse 'max_sync_errors' as a positive integer")?;

-        let default_concurrency_limit = if use_azure {
-            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
-        } else {
-            DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
-        };
        let concurrency_limit = NonZeroUsize::new(
-            parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
+            parse_optional_integer("concurrency_limit", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
        )
        .context("Failed to parse 'concurrency_limit' as a positive integer")?;

@@ -520,70 +451,33 @@ impl RemoteStorageConfig {
                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);

-        let endpoint = toml
-            .get("endpoint")
-            .map(|endpoint| parse_toml_string("endpoint", endpoint))
-            .transpose()?;
-
-        let storage = match (
-            local_path,
-            bucket_name,
-            bucket_region,
-            container_name,
-            container_region,
-        ) {
+        let storage = match (local_path, bucket_name, bucket_region) {
            // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
-            (None, None, None, None, None) => return Ok(None),
-            (_, Some(_), None, ..) => {
+            (None, None, None) => return Ok(None),
+            (_, Some(_), None) => {
                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
            }
-            (_, None, Some(_), ..) => {
+            (_, None, Some(_)) => {
                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
            }
-            (None, Some(bucket_name), Some(bucket_region), ..) => {
-                RemoteStorageKind::AwsS3(S3Config {
-                    bucket_name: parse_toml_string("bucket_name", bucket_name)?,
-                    bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                    prefix_in_bucket: toml
-                        .get("prefix_in_bucket")
-                        .map(|prefix_in_bucket| {
-                            parse_toml_string("prefix_in_bucket", prefix_in_bucket)
-                        })
-                        .transpose()?,
-                    endpoint,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                })
-            }
-            (_, _, _, Some(_), None) => {
-                bail!("'container_name' option is mandatory if 'container_region' is given ")
-            }
-            (_, _, _, None, Some(_)) => {
-                bail!("'container_name' option is mandatory if 'container_region' is given ")
-            }
-            (None, None, None, Some(container_name), Some(container_region)) => {
-                RemoteStorageKind::AzureContainer(AzureConfig {
-                    container_name: parse_toml_string("container_name", container_name)?,
-                    container_region: parse_toml_string("container_region", container_region)?,
-                    prefix_in_container: toml
-                        .get("prefix_in_container")
-                        .map(|prefix_in_container| {
-                            parse_toml_string("prefix_in_container", prefix_in_container)
-                        })
-                        .transpose()?,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                })
-            }
-            (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
-                Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
-            ),
-            (Some(_), Some(_), ..) => {
-                bail!("'local_path' and 'bucket_name' are mutually exclusive")
-            }
-            (Some(_), _, _, Some(_), Some(_)) => {
-                bail!("local_path and 'container_name' are mutually exclusive")
-            }
+            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
+                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                prefix_in_bucket: toml
+                    .get("prefix_in_bucket")
+                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
+                    .transpose()?,
+                endpoint: toml
+                    .get("endpoint")
+                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
+                    .transpose()?,
+                concurrency_limit,
+                max_keys_per_list_response,
+            }),
+            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(Utf8PathBuf::from(
+                parse_toml_string("local_path", local_path)?,
+            )),
+            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
        };

        Ok(Some(RemoteStorageConfig {
@@ -619,46 +513,6 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
    Ok(s.to_string())
 }

-struct ConcurrencyLimiter {
-    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
-    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
-    // The helps to ensure we don't exceed the thresholds.
-    write: Arc<Semaphore>,
-    read: Arc<Semaphore>,
-}
-
-impl ConcurrencyLimiter {
-    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
-        match kind {
-            RequestKind::Get => &self.read,
-            RequestKind::Put => &self.write,
-            RequestKind::List => &self.read,
-            RequestKind::Delete => &self.write,
-        }
-    }
-
-    async fn acquire(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
-        self.for_kind(kind).acquire().await
-    }
-
-    async fn acquire_owned(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
-        Arc::clone(self.for_kind(kind)).acquire_owned().await
-    }
-
-    fn new(limit: usize) -> ConcurrencyLimiter {
-        Self {
-            read: Arc::new(Semaphore::new(limit)),
-            write: Arc::new(Semaphore::new(limit)),
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,7 +4,7 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::borrow::Cow;
+use std::sync::Arc;

 use anyhow::Context;
 use aws_config::{
@@ -24,20 +24,22 @@ use aws_sdk_s3::{
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::io::{self, AsyncRead};
+use tokio::{
+    io::{self, AsyncRead},
+    sync::Semaphore,
+};
 use tokio_util::io::ReaderStream;
 use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
-    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;

-use self::metrics::AttemptOutcome;
-pub(super) use self::metrics::RequestKind;
+use self::metrics::{AttemptOutcome, RequestKind};

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -48,6 +50,46 @@ pub struct S3Bucket {
    concurrency_limiter: ConcurrencyLimiter,
 }

+struct ConcurrencyLimiter {
+    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
+    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
+    // The helps to ensure we don't exceed the thresholds.
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get => &self.read,
+            RequestKind::Put => &self.write,
+            RequestKind::List => &self.read,
+            RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            read: Arc::new(Semaphore::new(limit)),
+            write: Arc::new(Semaphore::new(limit)),
+        }
+    }
+}
+
 #[derive(Default)]
 struct GetObjectRequest {
    bucket: String,
@@ -514,20 +556,6 @@ impl RemoteStorage for S3Bucket {
                        .deleted_objects_total
                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        // Log a bounded number of the errors within the response:
-                        // these requests can carry 1000 keys so logging each one
-                        // would be too verbose, especially as errors may lead us
-                        // to retry repeatedly.
-                        const LOG_UP_TO_N_ERRORS: usize = 10;
-                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
-                            tracing::warn!(
-                                "DeleteObjects key {} failed: {}: {}",
-                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
-                            );
-                        }
-
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -6,7 +6,7 @@ use once_cell::sync::Lazy;
 pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
-pub(crate) enum RequestKind {
+pub(super) enum RequestKind {
    Get = 0,
    Put = 1,
    Delete = 2,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,625 +0,0 @@
-use std::collections::HashSet;
-use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
-use std::ops::ControlFlow;
-use std::path::PathBuf;
-use std::sync::Arc;
-use std::time::UNIX_EPOCH;
-
-use anyhow::Context;
-use camino::Utf8Path;
-use once_cell::sync::OnceCell;
-use remote_storage::{
-    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-};
-use test_context::{test_context, AsyncTestContext};
-use tokio::task::JoinSet;
-use tracing::{debug, error, info};
-
-static LOGGING_DONE: OnceCell<()> = OnceCell::new();
-
-const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
-
-const BASE_PREFIX: &str = "test";
-
-/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
-/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
-/// See the client creation in [`create_azure_client`] for details on the required env vars.
-/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
-/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
-///
-/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
-/// where
-/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
-/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
-///
-/// Then, verifies that the client does return correct prefixes when queried:
-/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
-/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
-///
-/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
-///
-/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
-/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledAzureWithTestBlobs)]
-#[tokio::test]
-async fn azure_pagination_should_work(
-    ctx: &mut MaybeEnabledAzureWithTestBlobs,
-) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("Azure init failed: {e:?}")
-        }
-    };
-
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let expected_remote_prefixes = ctx.remote_prefixes.clone();
-
-    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
-        .context("common_prefix construction")?;
-    let root_remote_prefixes = test_client
-        .list_prefixes(None)
-        .await
-        .context("client list root prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
-        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
-    );
-
-    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
-        .await
-        .context("client list nested prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let remote_only_prefixes = nested_remote_prefixes
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
-    Ok(())
-}
-
-/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries.
-/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
-/// See `Azure_pagination_should_work` for more information.
-///
-/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
-/// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
-#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
-#[tokio::test]
-async fn azure_list_files_works(
-    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
-) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("Azure init failed: {e:?}")
-        }
-    };
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let base_prefix =
-        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
-    let root_files = test_client
-        .list_files(None)
-        .await
-        .context("client list root files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_files,
-        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
-    );
-    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
-        .await
-        .context("client list nested files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let trim_remote_blobs: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .map(|x| x.get_path())
-        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(x).expect("must be valid path"))
-        .collect();
-    assert_eq!(
-        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
-    );
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzure::Enabled(ctx) => ctx,
-        MaybeEnabledAzure::Disabled => return Ok(()),
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    ctx.client.delete(&path).await.expect("should succeed");
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzure::Enabled(ctx) => ctx,
-        MaybeEnabledAzure::Disabled => return Ok(()),
-    };
-
-    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
-
-    ctx.client.delete_objects(&[path1, path2]).await?;
-
-    let prefixes = ctx.client.list_prefixes(None).await?;
-
-    assert_eq!(prefixes.len(), 1);
-
-    ctx.client.delete_objects(&[path3]).await?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
-
-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
-        let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
-        Ok(buf)
-    }
-    // Normal download request
-    let dl = ctx.client.download(&path).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
-
-    // Full range (end specified)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 0, Some(data_len))
-        .await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
-
-    // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
-
-    // partial range (end beyond real end)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
-        .await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
-
-    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
-
-    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete(&path)
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
-}
-
-fn ensure_logging_ready() {
-    LOGGING_DONE.get_or_init(|| {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-        )
-        .expect("logging init failed");
-    });
-}
-
-struct EnabledAzure {
-    client: Arc<GenericRemoteStorage>,
-    base_prefix: &'static str,
-}
-
-impl EnabledAzure {
-    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
-        let client = create_azure_client(max_keys_in_list_response)
-            .context("Azure client creation")
-            .expect("Azure client creation failed");
-
-        EnabledAzure {
-            client,
-            base_prefix: BASE_PREFIX,
-        }
-    }
-}
-
-enum MaybeEnabledAzure {
-    Enabled(EnabledAzure),
-    Disabled,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzure {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-
-        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        Self::Enabled(EnabledAzure::setup(None).await)
-    }
-}
-
-enum MaybeEnabledAzureWithTestBlobs {
-    Enabled(AzureWithTestBlobs),
-    Disabled,
-    UploadsFailed(anyhow::Error, AzureWithTestBlobs),
-}
-
-struct AzureWithTestBlobs {
-    enabled: EnabledAzure,
-    remote_prefixes: HashSet<RemotePath>,
-    remote_blobs: HashSet<RemotePath>,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        let max_keys_in_list_response = 10;
-        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
-
-        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
-
-        match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
-            ControlFlow::Continue(uploads) => {
-                info!("Remote objects created successfully");
-
-                Self::Enabled(AzureWithTestBlobs {
-                    enabled,
-                    remote_prefixes: uploads.prefixes,
-                    remote_blobs: uploads.blobs,
-                })
-            }
-            ControlFlow::Break(uploads) => Self::UploadsFailed(
-                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
-                AzureWithTestBlobs {
-                    enabled,
-                    remote_prefixes: uploads.prefixes,
-                    remote_blobs: uploads.blobs,
-                },
-            ),
-        }
-    }
-
-    async fn teardown(self) {
-        match self {
-            Self::Disabled => {}
-            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
-            }
-        }
-    }
-}
-
-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledAzureWithSimpleTestBlobs {
-    Enabled(AzureWithSimpleTestBlobs),
-    Disabled,
-    UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
-}
-struct AzureWithSimpleTestBlobs {
-    enabled: EnabledAzure,
-    remote_blobs: HashSet<RemotePath>,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        let max_keys_in_list_response = 10;
-        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
-
-        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
-
-        match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
-            ControlFlow::Continue(uploads) => {
-                info!("Remote objects created successfully");
-
-                Self::Enabled(AzureWithSimpleTestBlobs {
-                    enabled,
-                    remote_blobs: uploads,
-                })
-            }
-            ControlFlow::Break(uploads) => Self::UploadsFailed(
-                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
-                AzureWithSimpleTestBlobs {
-                    enabled,
-                    remote_blobs: uploads,
-                },
-            ),
-        }
-    }
-
-    async fn teardown(self) {
-        match self {
-            Self::Disabled => {}
-            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
-            }
-        }
-    }
-}
-
-fn create_azure_client(
-    max_keys_per_list_response: Option<i32>,
-) -> anyhow::Result<Arc<GenericRemoteStorage>> {
-    use rand::Rng;
-
-    let remote_storage_azure_container = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context(
-        "`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled",
-    )?;
-    let remote_storage_azure_region = env::var("REMOTE_STORAGE_AZURE_REGION").context(
-        "`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled",
-    )?;
-
-    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
-    // millis is just a debugging aid for easier finding the prefix later.
-    let millis = std::time::SystemTime::now()
-        .duration_since(UNIX_EPOCH)
-        .context("random Azure test prefix part calculation")?
-        .as_millis();
-
-    // because nanos can be the same for two threads so can millis, add randomness
-    let random = rand::thread_rng().gen::<u32>();
-
-    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
-        storage: RemoteStorageKind::AzureContainer(AzureConfig {
-            container_name: remote_storage_azure_container,
-            container_region: remote_storage_azure_region,
-            prefix_in_container: Some(format!("test_{millis}_{random:08x}/")),
-            concurrency_limit: NonZeroUsize::new(100).unwrap(),
-            max_keys_per_list_response,
-        }),
-    };
-    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
-    ))
-}
-
-struct Uploads {
-    prefixes: HashSet<RemotePath>,
-    blobs: HashSet<RemotePath>,
-}
-
-async fn upload_azure_data(
-    client: &Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
-    upload_tasks_count: usize,
-) -> ControlFlow<Uploads, Uploads> {
-    info!("Creating {upload_tasks_count} Azure files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
-            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
-                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
-
-            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok((upload_prefix, upload_path)) => {
-                uploaded_prefixes.insert(upload_prefix);
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    let uploads = Uploads {
-        prefixes: uploaded_prefixes,
-        blobs: uploaded_blobs,
-    };
-    if upload_tasks_failed {
-        ControlFlow::Break(uploads)
-    } else {
-        ControlFlow::Continue(uploads)
-    }
-}
-
-async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
-    info!(
-        "Removing {} objects from the remote storage during cleanup",
-        objects_to_delete.len()
-    );
-    let mut delete_tasks = JoinSet::new();
-    for object_to_delete in objects_to_delete {
-        let task_client = Arc::clone(client);
-        delete_tasks.spawn(async move {
-            debug!("Deleting remote item at path {object_to_delete:?}");
-            task_client
-                .delete(&object_to_delete)
-                .await
-                .with_context(|| format!("{object_to_delete:?} removal"))
-        });
-    }
-
-    while let Some(task_run_result) = delete_tasks.join_next().await {
-        match task_run_result {
-            Ok(task_result) => match task_result {
-                Ok(()) => {}
-                Err(e) => error!("Delete task failed: {e:?}"),
-            },
-            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
-        }
-    }
-}
-
-// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
-async fn upload_simple_azure_data(
-    client: &Arc<GenericRemoteStorage>,
-    upload_tasks_count: usize,
-) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
-    info!("Creating {upload_tasks_count} Azure files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(
-                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
-            )
-            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
-
-            Ok::<_, anyhow::Error>(blob_path)
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok(upload_path) => {
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    if upload_tasks_failed {
-        ControlFlow::Break(uploaded_blobs)
-    } else {
-        ControlFlow::Continue(uploaded_blobs)
-    }
-}
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,9 +1,8 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
-use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info};
+use tracing::error;

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -26,7 +25,7 @@ pub enum ApiError {
    PreconditionFailed(Box<str>),

    #[error("Resource temporarily unavailable: {0}")]
-    ResourceUnavailable(Cow<'static, str>),
+    ResourceUnavailable(String),

    #[error("Shutting down")]
    ShuttingDown,
@@ -116,12 +115,10 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {

 pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-
-    match api_error {
-        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
-        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
-        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
-        _ => error!("Error processing HTTP request: {api_error:#}"),
+    if let ApiError::InternalServerError(_) = api_error {
+        error!("Error processing HTTP request: {api_error:?}");
+    } else {
+        error!("Error processing HTTP request: {api_error:#}");
    }

    api_error.into_response()
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -73,8 +73,6 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

-pub mod sync;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -58,7 +58,7 @@ where
 // to get that.
 impl<T: Ord> PartialOrd for Waiter<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
+        other.wake_num.partial_cmp(&self.wake_num)
    }
 }

--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1 +0,0 @@
-pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,306 +0,0 @@
-use std::sync::{Arc, Mutex, MutexGuard};
-use tokio::sync::Semaphore;
-
-/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
-/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
-/// for the duration of initialization.
-///
-/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
-///
-/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
-pub struct OnceCell<T> {
-    inner: Mutex<Inner<T>>,
-}
-
-impl<T> Default for OnceCell<T> {
-    /// Create new uninitialized [`OnceCell`].
-    fn default() -> Self {
-        Self {
-            inner: Default::default(),
-        }
-    }
-}
-
-/// Semaphore is the current state:
-/// - open semaphore means the value is `None`, not yet initialized
-/// - closed semaphore means the value has been initialized
-#[derive(Debug)]
-struct Inner<T> {
-    init_semaphore: Arc<Semaphore>,
-    value: Option<T>,
-}
-
-impl<T> Default for Inner<T> {
-    fn default() -> Self {
-        Self {
-            init_semaphore: Arc::new(Semaphore::new(1)),
-            value: None,
-        }
-    }
-}
-
-impl<T> OnceCell<T> {
-    /// Creates an already initialized `OnceCell` with the given value.
-    pub fn new(value: T) -> Self {
-        let sem = Semaphore::new(1);
-        sem.close();
-        Self {
-            inner: Mutex::new(Inner {
-                init_semaphore: Arc::new(sem),
-                value: Some(value),
-            }),
-        }
-    }
-
-    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
-    /// returning the guard.
-    ///
-    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
-    ///
-    /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
-    where
-        F: FnOnce() -> Fut,
-        Fut: std::future::Future<Output = Result<T, E>>,
-    {
-        let sem = {
-            let guard = self.inner.lock().unwrap();
-            if guard.value.is_some() {
-                return Ok(Guard(guard));
-            }
-            guard.init_semaphore.clone()
-        };
-
-        let permit = sem.acquire_owned().await;
-        if permit.is_err() {
-            let guard = self.inner.lock().unwrap();
-            assert!(
-                guard.value.is_some(),
-                "semaphore got closed, must be initialized"
-            );
-            return Ok(Guard(guard));
-        } else {
-            // now we try
-            let value = factory().await?;
-
-            let mut guard = self.inner.lock().unwrap();
-            assert!(
-                guard.value.is_none(),
-                "we won permit, must not be initialized"
-            );
-            guard.value = Some(value);
-            guard.init_semaphore.close();
-            Ok(Guard(guard))
-        }
-    }
-
-    /// Returns a guard to an existing initialized value, if any.
-    pub fn get(&self) -> Option<Guard<'_, T>> {
-        let guard = self.inner.lock().unwrap();
-        if guard.value.is_some() {
-            Some(Guard(guard))
-        } else {
-            None
-        }
-    }
-}
-
-/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
-/// initialized value.
-#[derive(Debug)]
-pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
-
-impl<T> std::ops::Deref for Guard<'_, T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        self.0
-            .value
-            .as_ref()
-            .expect("guard is not created unless value has been initialized")
-    }
-}
-
-impl<T> std::ops::DerefMut for Guard<'_, T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.0
-            .value
-            .as_mut()
-            .expect("guard is not created unless value has been initialized")
-    }
-}
-
-impl<'a, T> Guard<'a, T> {
-    /// Take the current value, and a new permit for it's deinitialization.
-    ///
-    /// The permit will be on a semaphore part of the new internal value, and any following
-    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
-        let mut swapped = Inner::default();
-        let permit = swapped
-            .init_semaphore
-            .clone()
-            .try_acquire_owned()
-            .expect("we just created this");
-        std::mem::swap(&mut *self.0, &mut swapped);
-        swapped
-            .value
-            .map(|v| (v, permit))
-            .expect("guard is not created unless value has been initialized")
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::{
-        convert::Infallible,
-        sync::atomic::{AtomicUsize, Ordering},
-        time::Duration,
-    };
-
-    #[tokio::test]
-    async fn many_initializers() {
-        #[derive(Default, Debug)]
-        struct Counters {
-            factory_got_to_run: AtomicUsize,
-            future_polled: AtomicUsize,
-            winners: AtomicUsize,
-        }
-
-        let initializers = 100;
-
-        let cell = Arc::new(OnceCell::default());
-        let counters = Arc::new(Counters::default());
-        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
-
-        let mut js = tokio::task::JoinSet::new();
-        for i in 0..initializers {
-            js.spawn({
-                let cell = cell.clone();
-                let counters = counters.clone();
-                let barrier = barrier.clone();
-
-                async move {
-                    barrier.wait().await;
-                    let won = {
-                        let g = cell
-                            .get_or_init(|| {
-                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
-                                async {
-                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
-                                    Ok::<_, Infallible>(i)
-                                }
-                            })
-                            .await
-                            .unwrap();
-
-                        *g == i
-                    };
-
-                    if won {
-                        counters.winners.fetch_add(1, Ordering::Relaxed);
-                    }
-                }
-            });
-        }
-
-        barrier.wait().await;
-
-        while let Some(next) = js.join_next().await {
-            next.expect("no panics expected");
-        }
-
-        let mut counters = Arc::try_unwrap(counters).unwrap();
-
-        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
-        assert_eq!(*counters.future_polled.get_mut(), 1);
-        assert_eq!(*counters.winners.get_mut(), 1);
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn reinit_waits_for_deinit() {
-        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
-        let sleep_for = Duration::from_secs(1);
-        let initial = 42;
-        let reinit = 1;
-        let cell = Arc::new(OnceCell::new(initial));
-
-        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
-
-        let jh = tokio::spawn({
-            let cell = cell.clone();
-            let deinitialization_started = deinitialization_started.clone();
-            async move {
-                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
-                assert_eq!(answer, initial);
-
-                deinitialization_started.wait().await;
-                tokio::time::sleep(sleep_for).await;
-            }
-        });
-
-        deinitialization_started.wait().await;
-
-        let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
-            .await
-            .unwrap();
-
-        let elapsed = started_at.elapsed();
-        assert!(
-            elapsed >= sleep_for,
-            "initialization should had taken at least the time time slept with permit"
-        );
-
-        jh.await.unwrap();
-
-        assert_eq!(*cell.get().unwrap(), reinit);
-    }
-
-    #[tokio::test]
-    async fn initialization_attemptable_until_ok() {
-        let cell = OnceCell::default();
-
-        for _ in 0..10 {
-            cell.get_or_init(|| async { Err("whatever error") })
-                .await
-                .unwrap_err();
-        }
-
-        let g = cell
-            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
-            .await
-            .unwrap();
-        assert_eq!(*g, "finally success");
-    }
-
-    #[tokio::test]
-    async fn initialization_is_cancellation_safe() {
-        let cell = OnceCell::default();
-
-        let barrier = tokio::sync::Barrier::new(2);
-
-        let initializer = cell.get_or_init(|| async {
-            barrier.wait().await;
-            futures::future::pending::<()>().await;
-
-            Ok::<_, Infallible>("never reached")
-        });
-
-        tokio::select! {
-            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
-            _ = barrier.wait() => {}
-        };
-
-        // now initializer is dropped
-
-        assert!(cell.get().is_none());
-
-        let g = cell
-            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
-            .await
-            .unwrap();
-        assert_eq!(*g, "now initialized");
-    }
-}
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -27,8 +27,8 @@ and old one if it exists.
 * the filecache: a struct that allows communication with the Postgres file cache.
 On startup, we connect to the filecache and hold on to the connection for the
 entire monitor lifetime.
-* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
-usage and sends rolling aggregates to the runner.
+* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
+listening for `memory.high` events and setting its `memory.{high,max}` values.
 * the runner: the runner marries the filecache and cgroup watcher together,
 communicating with the agent throught the `Dispatcher`, and then calling filecache
 and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -1,38 +1,161 @@
-use std::fmt::{self, Debug, Formatter};
-use std::time::{Duration, Instant};
-
-use anyhow::{anyhow, Context};
-use cgroups_rs::{
-    hierarchies::{self, is_cgroup2_unified_mode},
-    memory::MemController,
-    Subsystem,
+use std::{
+    fmt::{Debug, Display},
+    fs,
+    pin::pin,
+    sync::atomic::{AtomicU64, Ordering},
 };
-use tokio::sync::watch;
+
+use anyhow::{anyhow, bail, Context};
+use cgroups_rs::{
+    freezer::FreezerController,
+    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    memory::MemController,
+    MaxValue,
+    Subsystem::{Freezer, Mem},
+};
+use inotify::{EventStream, Inotify, WatchMask};
+use tokio::sync::mpsc::{self, error::TryRecvError};
+use tokio::time::{Duration, Instant};
+use tokio_stream::{Stream, StreamExt};
 use tracing::{info, warn};

+use crate::protocol::Resources;
+use crate::MiB;
+
+/// Monotonically increasing counter of the number of memory.high events
+/// the cgroup has experienced.
+///
+/// We use this to determine if a modification to the `memory.events` file actually
+/// changed the `high` field. If not, we don't care about the change. When we
+/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
+/// to see if it changed since last time.
+pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Monotonically increasing counter that gives each cgroup event a unique id.
+///
+/// This allows us to answer questions like "did this upscale arrive before this
+/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
+/// with a sequence number. As such, prefer to use the `Sequenced` type rather
+/// than this static directly.
+static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
+
+/// A memory event type reported in memory.events.
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub enum MemoryEvent {
+    Low,
+    High,
+    Max,
+    Oom,
+    OomKill,
+    OomGroupKill,
+}
+
+impl MemoryEvent {
+    fn as_str(&self) -> &str {
+        match self {
+            MemoryEvent::Low => "low",
+            MemoryEvent::High => "high",
+            MemoryEvent::Max => "max",
+            MemoryEvent::Oom => "oom",
+            MemoryEvent::OomKill => "oom_kill",
+            MemoryEvent::OomGroupKill => "oom_group_kill",
+        }
+    }
+}
+
+impl Display for MemoryEvent {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
 /// Configuration for a `CgroupWatcher`
 #[derive(Debug, Clone)]
 pub struct Config {
-    /// Interval at which we should be fetching memory statistics
-    memory_poll_interval: Duration,
+    // The target difference between the total memory reserved for the cgroup
+    // and the value of the cgroup's memory.high.
+    //
+    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
+    // use (equal to system memory, minus whatever's taken out for the file cache).
+    oom_buffer_bytes: u64,

-    /// The number of samples used in constructing aggregated memory statistics
-    memory_history_len: usize,
-    /// The number of most recent samples that will be periodically logged.
-    ///
-    /// Each sample is logged exactly once. Increasing this value means that recent samples will be
-    /// logged less frequently, and vice versa.
-    ///
-    /// For simplicity, this value must be greater than or equal to `memory_history_len`.
-    memory_history_log_interval: usize,
+    // The amount of memory, in bytes, below a proposed new value for
+    // memory.high that the cgroup's memory usage must be for us to downscale
+    //
+    // In other words, we can downscale only when:
+    //
+    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
+    //
+    // TODO: there's some minor issues with this approach -- in particular, that we might have
+    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
+    pub(crate) memory_high_buffer_bytes: u64,
+
+    // The maximum duration, in milliseconds, that we're allowed to pause
+    // the cgroup for while waiting for the autoscaler-agent to upscale us
+    max_upscale_wait: Duration,
+
+    // The required minimum time, in milliseconds, that we must wait before re-freezing
+    // the cgroup while waiting for the autoscaler-agent to upscale us.
+    do_not_freeze_more_often_than: Duration,
+
+    // The amount of memory, in bytes, that we should periodically increase memory.high
+    // by while waiting for the autoscaler-agent to upscale us.
+    //
+    // This exists to avoid the excessive throttling that happens when a cgroup is above its
+    // memory.high for too long. See more here:
+    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
+    memory_high_increase_by_bytes: u64,
+
+    // The period, in milliseconds, at which we should repeatedly increase the value
+    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
+    // is still being hit.
+    //
+    // Technically speaking, this actually serves as a rate limit to moderate responding to
+    // memory.high events, but these are roughly equivalent if the process is still allocating
+    // memory.
+    memory_high_increase_every: Duration,
+}
+
+impl Config {
+    /// Calculate the new value for the cgroups memory.high based on system memory
+    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
+        total_system_mem.saturating_sub(self.oom_buffer_bytes)
+    }
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
-            memory_poll_interval: Duration::from_millis(100),
-            memory_history_len: 5, // use 500ms of history for decision-making
-            memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
+            oom_buffer_bytes: 100 * MiB,
+            memory_high_buffer_bytes: 100 * MiB,
+            // while waiting for upscale, don't freeze for more than 20ms every 1s
+            max_upscale_wait: Duration::from_millis(20),
+            do_not_freeze_more_often_than: Duration::from_millis(1000),
+            // while waiting for upscale, increase memory.high by 10MiB every 25ms
+            memory_high_increase_by_bytes: 10 * MiB,
+            memory_high_increase_every: Duration::from_millis(25),
+        }
+    }
+}
+
+/// Used to represent data that is associated with a certain point in time, such
+/// as an upscale request or memory.high event.
+///
+/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
+/// a unique sequence number. Sequence numbers are monotonically increasing,
+/// allowing us to answer questions like "did this upscale happen after this
+/// memory.high event?" by comparing the sequence numbers of the two events.
+#[derive(Debug, Clone)]
+pub struct Sequenced<T> {
+    seqnum: u64,
+    data: T,
+}
+
+impl<T> Sequenced<T> {
+    pub fn new(data: T) -> Self {
+        Self {
+            seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
+            data,
        }
    }
 }
@@ -47,14 +170,74 @@ impl Default for Config {
 pub struct CgroupWatcher {
    pub config: Config,

+    /// The sequence number of the last upscale.
+    ///
+    /// If we receive a memory.high event that has a _lower_ sequence number than
+    /// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
+    /// can safely ignore it.
+    ///
+    /// Note: Like the `events` field, this doesn't _need_ interior mutability but we
+    /// use it anyways so that methods take `&self`, not `&mut self`.
+    last_upscale_seqnum: AtomicU64,
+
+    /// A channel on which we send messages to request upscale from the dispatcher.
+    upscale_requester: mpsc::Sender<()>,
+
    /// The actual cgroup we are watching and managing.
    cgroup: cgroups_rs::Cgroup,
 }

+/// Read memory.events for the desired event type.
+///
+/// `path` specifies the path to the desired `memory.events` file.
+/// For more info, see the `memory.events` section of the [kernel docs]
+/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
+fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
+    let contents = fs::read_to_string(path)
+        .with_context(|| format!("failed to read memory.events from {path}"))?;
+
+    // The contents of the file look like:
+    // low 42
+    // high 101
+    // ...
+    contents
+        .lines()
+        .filter_map(|s| s.split_once(' '))
+        .find(|(e, _)| *e == event.as_str())
+        .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
+        .and_then(|(_, count)| {
+            count
+                .parse::<u64>()
+                .with_context(|| format!("failed to parse memory.{event} as u64"))
+        })
+}
+
+/// Create an event stream that produces events whenever the file at the provided
+/// path is modified.
+fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
+    info!("creating file watcher for {path}");
+    let inotify = Inotify::init().context("failed to initialize file watcher")?;
+    inotify
+        .watches()
+        .add(path, WatchMask::MODIFY)
+        .with_context(|| format!("failed to start watching {path}"))?;
+    inotify
+        // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
+        // to store one event at a time - if the event gets written over, that's
+        // ok. We still see that there is an event. For more information, see:
+        // https://man7.org/linux/man-pages/man7/inotify.7.html
+        .into_event_stream([0u8; 1024])
+        .context("failed to start inotify event stream")
+}
+
 impl CgroupWatcher {
    /// Create a new `CgroupWatcher`.
    #[tracing::instrument(skip_all, fields(%name))]
-    pub fn new(name: String) -> anyhow::Result<Self> {
+    pub fn new(
+        name: String,
+        // A channel on which to send upscale requests
+        upscale_requester: mpsc::Sender<()>,
+    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
        // TODO: clarify exactly why we need v2
        // Make sure cgroups v2 (aka unified) are supported
        if !is_cgroup2_unified_mode() {
@@ -62,203 +245,410 @@ impl CgroupWatcher {
        }
        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);

-        Ok(Self {
-            cgroup,
-            config: Default::default(),
-        })
+        // Start monitoring the cgroup for memory events. In general, for
+        // cgroups v2 (aka unified), metrics are reported in files like
+        // > `/sys/fs/cgroup/{name}/{metric}`
+        // We are looking for `memory.high` events, which are stored in the
+        // file `memory.events`. For more info, see the `memory.events` section
+        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
+        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
+        let memory_events = create_file_watcher(&path)
+            .with_context(|| format!("failed to create event watcher for {path}"))?
+            // This would be nice with .inspect_err followed by .ok
+            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
+                Ok(high) => Some(high),
+                Err(error) => {
+                    // TODO: Might want to just panic here
+                    warn!(?error, "failed to read high events count from {}", &path);
+                    None
+                }
+            })
+            // Only report the event if the memory.high count increased
+            .filter_map(|high| {
+                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
+                    Some(high)
+                } else {
+                    None
+                }
+            })
+            .map(Sequenced::new);
+
+        let initial_count = get_event_count(
+            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
+            MemoryEvent::High,
+        )?;
+
+        info!(initial_count, "initial memory.high event count");
+
+        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
+        // running in the cgroup before that caused it to be non-zero.
+        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
+
+        Ok((
+            Self {
+                cgroup,
+                upscale_requester,
+                last_upscale_seqnum: AtomicU64::new(0),
+                config: Default::default(),
+            },
+            memory_events,
+        ))
    }

    /// The entrypoint for the `CgroupWatcher`.
    #[tracing::instrument(skip_all)]
-    pub async fn watch(
+    pub async fn watch<E>(
        &self,
-        updates: watch::Sender<(Instant, MemoryHistory)>,
-    ) -> anyhow::Result<()> {
-        // this requirement makes the code a bit easier to work with; see the config for more.
-        assert!(self.config.memory_history_len <= self.config.memory_history_log_interval);
+        // These are ~dependency injected~ (fancy, I know) because this function
+        // should never return.
+        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
+        // -> therefore: if we want to stick it in an Arc so many threads can access
+        //    it, methods can never take mutable access.
+        //     - note: we use the Arc strategy so that a) we can call this function
+        //             right here and b) the runner can call the set/get_memory methods
+        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
+        //    we just pass them in here instead of holding them in fields, as that
+        //    would require this method to take &mut self.
+        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
+        events: E,
+    ) -> anyhow::Result<()>
+    where
+        E: Stream<Item = Sequenced<u64>>,
+    {
+        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut last_memory_high_increase_at: Option<Instant> = None;
+        let mut events = pin!(events);

-        let mut ticker = tokio::time::interval(self.config.memory_poll_interval);
-        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
-        // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0
+        // Are we waiting to be upscaled? Could be true if we request upscale due
+        // to a memory.high event and it does not arrive in time.
+        let mut waiting_on_upscale = false;

-        let mem_controller = self.memory()?;
+        loop {
+            tokio::select! {
+                upscale = upscales.recv() => {
+                    let Sequenced { seqnum, data } = upscale
+                        .context("failed to listen on upscale notification channel")?;
+                    waiting_on_upscale = false;
+                    last_memory_high_increase_at = None;
+                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                }
+                event = events.next() => {
+                    let Some(Sequenced { seqnum, .. }) = event else {
+                        bail!("failed to listen for memory.high events")
+                    };
+                    // The memory.high came before our last upscale, so we consider
+                    // it resolved
+                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
+                        info!(
+                            "received memory.high event, but it came before our last upscale -> ignoring it"
+                        );
+                        continue;
+                    }

-        // buffer for samples that will be logged. once full, it remains so.
-        let history_log_len = self.config.memory_history_log_interval;
-        let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
+                    // The memory.high came after our latest upscale. We don't
+                    // want to do anything yet, so peek the next event in hopes
+                    // that it's an upscale.
+                    if let Some(upscale_num) = self
+                        .upscaled(&mut upscales)
+                        .context("failed to check if we were upscaled")?
+                    {
+                        if upscale_num > seqnum {
+                            info!(
+                                "received memory.high event, but it came before our last upscale -> ignoring it"
+                            );
+                            continue;
+                        }
+                    }

-        for t in 0_u64.. {
-            ticker.tick().await;
+                    // If it's been long enough since we last froze, freeze the
+                    // cgroup and request upscale
+                    if wait_to_freeze.is_elapsed() {
+                        info!("received memory.high event -> requesting upscale");
+                        waiting_on_upscale = self
+                            .handle_memory_high_event(&mut upscales)
+                            .await
+                            .context("failed to handle upscale")?;
+                        wait_to_freeze
+                            .as_mut()
+                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
+                        continue;
+                    }

-            let now = Instant::now();
-            let mem = Self::memory_usage(mem_controller);
+                    // Ok, we can't freeze, just request upscale
+                    if !waiting_on_upscale {
+                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");

-            let i = t as usize % history_log_len;
-            history_log_buf[i] = mem;
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to request upscaling because we got upscaled");
+                            continue;
+                        }
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+                        waiting_on_upscale = true;
+                        continue;
+                    }

-            // We're taking *at most* memory_history_len values; we may be bounded by the total
-            // number of samples that have come in so far.
-            let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize;
-            // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact
-            // that we just inserted a value there, so the end of the iterator will *include* the
-            // value at i, rather than stopping just short of it.
-            let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count);
+                    // Shoot, we can't freeze and we're still waiting on upscale,
+                    // increase memory.high to reduce throttling
+                    let can_increase_memory_high = match last_memory_high_increase_at {
+                        None => true,
+                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
+                    };
+                    if can_increase_memory_high {
+                        info!(
+                            "received memory.high event, \
+                            but too soon to refreeze and already requested upscale \
+                            -> increasing memory.high"
+                        );

-            let summary = MemoryHistory {
-                avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::<u64>()
-                    / samples_count as u64,
-                samples_count,
-                samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32,
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to increase memory.high because got upscaled");
+                            continue;
+                        }
+
+                        // Request upscale anyways (the agent will handle deduplicating
+                        // requests)
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+
+                        let memory_high =
+                            self.get_memory_high_bytes().context("failed to get memory.high")?;
+                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
+                        info!(
+                            current_high_bytes = memory_high,
+                            new_high_bytes = new_high,
+                            "updating memory.high"
+                        );
+                        self.set_memory_high_bytes(new_high)
+                            .context("failed to set memory.high")?;
+                        last_memory_high_increase_at = Some(Instant::now());
+                        continue;
+                    }
+
+                    info!("received memory.high event, but can't do anything");
+                }
            };
-
-            // Log the current history if it's time to do so. Because `history_log_buf` has length
-            // equal to the logging interval, we can just log the entire buffer every time we set
-            // the last entry, which also means that for this log line, we can ignore that it's a
-            // ring buffer (because all the entries are in order of increasing time).
-            if i == history_log_len - 1 {
-                info!(
-                    history = ?MemoryStatus::debug_slice(&history_log_buf),
-                    summary = ?summary,
-                    "Recent cgroup memory statistics history"
-                );
-            }
-
-            updates
-                .send((now, summary))
-                .context("failed to send MemoryHistory")?;
        }
+    }

-        unreachable!()
+    /// Handle a `memory.high` event, returning whether we are still waiting on
+    /// upscale by the time the function returns.
+    ///
+    /// The general plan for handling a `memory.high` event is as follows:
+    /// 1. Freeze the cgroup
+    /// 2. Start a timer for `self.config.max_upscale_wait`
+    /// 3. Request upscale
+    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
+    /// 5. Return whether or not we are still waiting for upscale. If we are,
+    ///    we'll increase the cgroup's memory.high to avoid getting OOM-killed.
+    #[tracing::instrument(skip_all)]
+    async fn handle_memory_high_event(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<bool> {
+        // Immediately freeze the cgroup before doing anything else.
+        info!("received memory.high event -> freezing cgroup");
+        self.freeze().context("failed to freeze cgroup")?;
+
+        // We'll use this for logging durations
+        let start_time = Instant::now();
+
+        // Await the upscale until we have to unfreeze
+        let timed =
+            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
+
+        // Request the upscale
+        info!(
+            wait = ?self.config.max_upscale_wait,
+            "sending request for immediate upscaling",
+        );
+        self.upscale_requester
+            .send(())
+            .await
+            .context("failed to request upscale")?;
+
+        let waiting_on_upscale = match timed.await {
+            Ok(Ok(())) => {
+                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
+                false
+            }
+            // **important**: unfreeze the cgroup before ?-reporting the error
+            Ok(Err(e)) => {
+                info!("error waiting for upscale -> thawing cgroup");
+                self.thaw()
+                    .context("failed to thaw cgroup after errored waiting for upscale")?;
+                Err(e.context("failed to await upscale"))?
+            }
+            Err(_) => {
+                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
+                true
+            }
+        };
+
+        info!("thawing cgroup");
+        self.thaw().context("failed to thaw cgroup")?;
+
+        Ok(waiting_on_upscale)
+    }
+
+    /// Checks whether we were just upscaled, returning the upscale's sequence
+    /// number if so.
+    #[tracing::instrument(skip_all)]
+    fn upscaled(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<Option<u64>> {
+        let Sequenced { seqnum, data } = match upscales.try_recv() {
+            Ok(upscale) => upscale,
+            Err(TryRecvError::Empty) => return Ok(None),
+            Err(TryRecvError::Disconnected) => {
+                bail!("upscale notification channel was disconnected")
+            }
+        };
+
+        // Make sure to update the last upscale sequence number
+        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+        info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+        Ok(Some(seqnum))
+    }
+
+    /// Await an upscale event, recording its sequence number. Only `upscales`
+    /// is polled here; `memory.high` events are not consumed in the process.
+    ///
+    /// This is used in `handle_memory_high_event`, where we need to listen
+    /// for upscales in particular so we know if we can thaw the cgroup early.
+    #[tracing::instrument(skip_all)]
+    async fn await_upscale(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<()> {
+        let Sequenced { seqnum, .. } = upscales
+            .recv()
+            .await
+            .context("error listening for upscales")?;
+
+        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+        Ok(())
+    }
+
+    /// Get the cgroup's name.
+    pub fn path(&self) -> &str {
+        self.cgroup.path()
+    }
+}
+
+// Methods for manipulating the actual cgroup
+impl CgroupWatcher {
+    /// Get a handle on the freezer subsystem.
+    fn freezer(&self) -> anyhow::Result<&FreezerController> {
+        if let Some(Freezer(freezer)) = self
+            .cgroup
+            .subsystems()
+            .iter()
+            .find(|sub| matches!(sub, Freezer(_)))
+        {
+            Ok(freezer)
+        } else {
+            anyhow::bail!("could not find freezer subsystem")
+        }
+    }
+
+    /// Attempt to freeze the cgroup.
+    pub fn freeze(&self) -> anyhow::Result<()> {
+        self.freezer()
+            .context("failed to get freezer subsystem")?
+            .freeze()
+            .context("failed to freeze")
+    }
+
+    /// Attempt to thaw the cgroup.
+    pub fn thaw(&self) -> anyhow::Result<()> {
+        self.freezer()
+            .context("failed to get freezer subsystem")?
+            .thaw()
+            .context("failed to thaw")
    }

    /// Get a handle on the memory subsystem.
+    ///
+    /// Note: this method does not require `self.memory_update_lock` because
+    /// getting a handle to the subsystem does not access any of the files we
+    /// care about, such as memory.high and memory.events
    fn memory(&self) -> anyhow::Result<&MemController> {
-        self.cgroup
+        if let Some(Mem(memory)) = self
+            .cgroup
            .subsystems()
            .iter()
-            .find_map(|sub| match sub {
-                Subsystem::Mem(c) => Some(c),
-                _ => None,
+            .find(|sub| matches!(sub, Mem(_)))
+        {
+            Ok(memory)
+        } else {
+            anyhow::bail!("could not find memory subsystem")
+        }
+    }
+
+    /// Get cgroup current memory usage.
+    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
+        Ok(self
+            .memory()
+            .context("failed to get memory subsystem")?
+            .memory_stat()
+            .usage_in_bytes)
+    }
+
+    /// Set cgroup memory.high threshold.
+    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
+    }
+
+    /// Set the cgroup's memory.high to 'max', disabling it.
+    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Max)
+    }
+
+    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
+        self.memory()
+            .context("failed to get memory subsystem")?
+            .set_mem(cgroups_rs::memory::SetMemory {
+                low: None,
+                high: Some(value),
+                min: None,
+                max: None,
            })
-            .ok_or_else(|| anyhow!("could not find memory subsystem"))
+            .map_err(anyhow::Error::from)
    }

-    /// Given a handle on the memory subsystem, returns the current memory information
-    fn memory_usage(mem_controller: &MemController) -> MemoryStatus {
-        let stat = mem_controller.memory_stat().stat;
-        MemoryStatus {
-            non_reclaimable: stat.active_anon + stat.inactive_anon,
+    /// Get memory.high threshold.
+    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
+        let high = self
+            .memory()
+            .context("failed to get memory subsystem while getting memory statistics")?
+            .get_mem()
+            .map(|mem| mem.high)
+            .context("failed to get memory statistics from subsystem")?;
+        match high {
+            Some(MaxValue::Max) => Ok(i64::MAX as u64),
+            Some(MaxValue::Value(high)) => Ok(high as u64),
+            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
        }
    }
 }
-
-// Helper function for `CgroupWatcher::watch`
-fn ring_buf_recent_values_iter<T>(
-    buf: &[T],
-    last_value_idx: usize,
-    count: usize,
-) -> impl '_ + Iterator<Item = &T> {
-    // Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function
-    // easier (we only have to add `buf.len()` once, rather than a dynamic number of times).
-    assert!(count <= buf.len());
-
-    buf.iter()
-        // 'cycle' because the values could wrap around
-        .cycle()
-        // with 'cycle', this skip is more like 'offset', and functionally this is
-        // offsettting by 'last_value_idx - count (mod buf.len())', but we have to be
-        // careful to avoid underflow, so we pre-add buf.len().
-        // The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive.
-        .skip((buf.len() + last_value_idx + 1 - count) % buf.len())
-        .take(count)
-}
-
-/// Summary of recent memory usage
-#[derive(Debug, Copy, Clone)]
-pub struct MemoryHistory {
-    /// Rolling average of non-reclaimable memory usage samples over the last `history_period`
-    pub avg_non_reclaimable: u64,
-
-    /// The number of samples used to construct this summary
-    pub samples_count: usize,
-    /// Total timespan between the first and last sample used for this summary
-    pub samples_span: Duration,
-}
-
-#[derive(Debug, Copy, Clone)]
-pub struct MemoryStatus {
-    non_reclaimable: u64,
-}
-
-impl MemoryStatus {
-    fn zeroed() -> Self {
-        MemoryStatus { non_reclaimable: 0 }
-    }
-
-    fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
-        struct DS<'a>(&'a [MemoryStatus]);
-
-        impl<'a> Debug for DS<'a> {
-            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-                f.debug_struct("[MemoryStatus]")
-                    .field(
-                        "non_reclaimable[..]",
-                        &Fields(self.0, |stat: &MemoryStatus| {
-                            BytesToGB(stat.non_reclaimable)
-                        }),
-                    )
-                    .finish()
-            }
-        }
-
-        struct Fields<'a, F>(&'a [MemoryStatus], F);
-
-        impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
-            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-                f.debug_list().entries(self.0.iter().map(&self.1)).finish()
-            }
-        }
-
-        struct BytesToGB(u64);
-
-        impl Debug for BytesToGB {
-            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-                f.write_fmt(format_args!(
-                    "{:.3}Gi",
-                    self.0 as f64 / (1_u64 << 30) as f64
-                ))
-            }
-        }
-
-        DS(slice)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn ring_buf_iter() {
-        let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
-
-        let values = |offset, count| {
-            super::ring_buf_recent_values_iter(&buf, offset, count)
-                .copied()
-                .collect::<Vec<i32>>()
-        };
-
-        // Boundary conditions: start, end, and entire thing:
-        assert_eq!(values(0, 1), [0]);
-        assert_eq!(values(3, 4), [0, 1, 2, 3]);
-        assert_eq!(values(9, 4), [6, 7, 8, 9]);
-        assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
-
-        // "normal" operation: no wraparound
-        assert_eq!(values(7, 4), [4, 5, 6, 7]);
-
-        // wraparound:
-        assert_eq!(values(0, 4), [7, 8, 9, 0]);
-        assert_eq!(values(1, 4), [8, 9, 0, 1]);
-        assert_eq!(values(2, 4), [9, 0, 1, 2]);
-        assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
-    }
-}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -12,10 +12,12 @@ use futures::{
    stream::{SplitSink, SplitStream},
    SinkExt, StreamExt,
 };
+use tokio::sync::mpsc;
 use tracing::info;

+use crate::cgroup::Sequenced;
 use crate::protocol::{
-    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
    PROTOCOL_MIN_VERSION,
 };

@@ -34,6 +36,13 @@ pub struct Dispatcher {
    /// We send messages to the agent through `sink`
    sink: SplitSink<WebSocket, Message>,

+    /// Used to notify the cgroup when we are upscaled.
+    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+
+    /// When the cgroup requests upscale it will send on this channel. In response
+    /// we send an `UpscaleRequest` to the agent.
+    pub(crate) request_upscale_events: mpsc::Receiver<()>,
+
    /// The protocol version we have agreed to use with the agent. This is negotiated
    /// during the creation of the dispatcher, and should be the highest shared protocol
    /// version.
@@ -52,7 +61,11 @@ impl Dispatcher {
    /// 1. Wait for the agent to sent the range of protocols it supports.
    /// 2. Send a protocol version that works for us as well, or an error if there
    ///    is no compatible version.
-    pub async fn new(stream: WebSocket) -> anyhow::Result<Self> {
+    pub async fn new(
+        stream: WebSocket,
+        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+        request_upscale_events: mpsc::Receiver<()>,
+    ) -> anyhow::Result<Self> {
        let (mut sink, mut source) = stream.split();

        // Figure out the highest protocol version we both support
@@ -106,10 +119,22 @@ impl Dispatcher {
        Ok(Self {
            sink,
            source,
+            notify_upscale_events,
+            request_upscale_events,
            proto_version: highest_shared_version,
        })
    }

+    /// Notify the cgroup manager that we have received upscale by sending the
+    /// new resources over the `notify_upscale_events` channel.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
+        self.notify_upscale_events
+            .send(resources)
+            .await
+            .context("failed to send resources and oneshot sender across channel")
+    }
+
    /// Send a message to the agent.
    ///
    /// Although this function is small, it has one major benefit: it is the only
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -5,16 +5,18 @@
 //! all functionality.

 use std::fmt::Debug;
+use std::sync::Arc;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Context};
 use axum::extract::ws::{Message, WebSocket};
 use futures::StreamExt;
-use tokio::sync::{broadcast, watch};
+use tokio::sync::broadcast;
+use tokio::sync::mpsc;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};

-use crate::cgroup::{self, CgroupWatcher};
+use crate::cgroup::{CgroupWatcher, Sequenced};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -26,7 +28,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args
 pub struct Runner {
    config: Config,
    filecache: Option<FileCacheState>,
-    cgroup: Option<CgroupState>,
+    cgroup: Option<Arc<CgroupWatcher>>,
    dispatcher: Dispatcher,

    /// We "mint" new message ids by incrementing this counter and taking the value.
@@ -43,14 +45,6 @@ pub struct Runner {
    kill: broadcast::Receiver<()>,
 }

-#[derive(Debug)]
-struct CgroupState {
-    watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>,
-    /// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale
-    /// requests.
-    threshold: u64,
-}
-
 /// Configuration for a `Runner`
 #[derive(Debug)]
 pub struct Config {
@@ -68,56 +62,16 @@ pub struct Config {
    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,
-
-    /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
-    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
-    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
-    /// threshold.
-    ///
-    /// For example, a value of `0.1` means that 10% of total memory must remain after exceeding
-    /// the threshold, so the value of the cgroup threshold would always be capped at 90% of total
-    /// memory.
-    ///
-    /// The default value of `0.15` means that we *guarantee* sending upscale requests if the
-    /// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
-    /// memory for the file cache).
-    cgroup_min_overhead_fraction: f64,
-
-    cgroup_downscale_threshold_buffer_bytes: u64,
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
            sys_buffer_bytes: 100 * MiB,
-            cgroup_min_overhead_fraction: 0.15,
-            cgroup_downscale_threshold_buffer_bytes: 100 * MiB,
        }
    }
 }

-impl Config {
-    fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
-        // If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
-        // and thus be non-reclaimable, so we should allow for additional memory usage.
-        //
-        // If the file cache sits on disk, our desired stable system state is for it to be fully
-        // page cached (its contents should only be paged to/from disk in situations where we can't
-        // upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
-        // threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
-        // out the file cache.
-        let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
-
-        // Even if we're not separately making room for the file cache (if it's in tmpfs), we still
-        // want our threshold to be met gracefully instead of letting postgres get OOM-killed.
-        // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
-        // remaining above the threshold.
-        let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
-
-        memory_remaining_for_cgroup.min(max_threshold)
-    }
-}
-
 impl Runner {
    /// Create a new monitor.
    #[tracing::instrument(skip_all, fields(?config, ?args))]
@@ -133,7 +87,12 @@ impl Runner {
            "invalid monitor Config: sys_buffer_bytes cannot be 0"
        );

-        let dispatcher = Dispatcher::new(ws)
+        // *NOTE*: the dispatcher and cgroup manager talk through these channels
+        // so make sure they each get the correct half, nothing is dropped, etc.
+        let (notified_send, notified_recv) = mpsc::channel(1);
+        let (requesting_send, requesting_recv) = mpsc::channel(1);
+
+        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
            .await
            .context("error creating new dispatcher")?;

@@ -147,9 +106,45 @@ impl Runner {
            kill,
        };

-        let mem = get_total_system_memory();
+        // If we have both the cgroup and file cache integrations enabled, it's possible for
+        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
+        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
+        // we *do* still want to determine the file cache size before setting the cgroup's
+        // memory.high, so it's not as simple as just swapping the order.
+        //
+        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
+        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
+        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
+        // of a hacky solution, but helps with reliability.
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");

-        let mut file_cache_disk_size = 0;
+            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
+                .context("failed to create cgroup manager")?;
+
+            info!("temporarily unsetting memory.high");
+
+            // Temporarily un-set cgroup memory.high; see above.
+            cgroup
+                .unset_memory_high()
+                .context("failed to unset memory.high")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            let cgroup_clone = Arc::clone(&cgroup);
+            spawn_with_cancel(
+                token.clone(),
+                |_| error!("cgroup watcher terminated"),
+                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
+            );
+
+            state.cgroup = Some(cgroup);
+        }
+
+        let mut file_cache_reserved_bytes = 0;
+        let mem = get_total_system_memory();

        // We need to process file cache initialization before cgroup initialization, so that the memory
        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
@@ -161,7 +156,7 @@ impl Runner {
                false => FileCacheConfig::default_in_memory(),
            };

-            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+            let mut file_cache = FileCacheState::new(connstr, config, token)
                .await
                .context("failed to create file cache")?;

@@ -186,40 +181,23 @@ impl Runner {
            if actual_size != new_size {
                info!("file cache size actually got set to {actual_size}")
            }
-
-            if args.file_cache_on_disk {
-                file_cache_disk_size = actual_size;
+            // Mark the resources given to the file cache as reserved, but only if it's in memory.
+            if !args.file_cache_on_disk {
+                file_cache_reserved_bytes = actual_size;
            }

            state.filecache = Some(file_cache);
        }

-        if let Some(name) = &args.cgroup {
-            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
-            // now, and then set limits later.
-            info!("initializing cgroup");
+        if let Some(cgroup) = &state.cgroup {
+            let available = mem - file_cache_reserved_bytes;
+            let value = cgroup.config.calculate_memory_high_value(available);

-            let cgroup =
-                CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;
+            info!(value, "setting memory.high");

-            let init_value = cgroup::MemoryHistory {
-                avg_non_reclaimable: 0,
-                samples_count: 0,
-                samples_span: Duration::ZERO,
-            };
-            let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));
-
-            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
-                cgroup.watch(hist_tx).await
-            });
-
-            let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
-            info!(threshold, "set initial cgroup threshold",);
-
-            state.cgroup = Some(CgroupState {
-                watcher: hist_rx,
-                threshold,
-            });
+            cgroup
+                .set_memory_high_bytes(value)
+                .context("failed to set cgroup memory.high")?;
        }

        Ok(state)
@@ -239,51 +217,28 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let (expected_file_cache_size, expected_file_cache_disk_size) = self
+        let expected_file_cache_mem_usage = self
            .filecache
            .as_ref()
-            .map(|file_cache| {
-                let size = file_cache.config.calculate_cache_size(usable_system_memory);
-                match file_cache.config.in_memory {
-                    true => (size, 0),
-                    false => (size, size),
-                }
-            })
-            .unwrap_or((0, 0));
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
+        let mut new_cgroup_mem_high = 0;
        if let Some(cgroup) = &self.cgroup {
-            let (last_time, last_history) = *cgroup.watcher.borrow();
-
-            // NB: The ordering of these conditions is intentional. During startup, we should deny
-            // downscaling until we have enough information to determine that it's safe to do so
-            // (i.e. enough samples have come in). But if it's been a while and we *still* haven't
-            // received any information, we should *fail* instead of just denying downscaling.
-            //
-            // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
-            // serves double-duty: it trips if we haven't received *any* metrics for long enough,
-            // OR if we haven't received metrics *recently enough*.
-            //
-            // TODO: make the duration here configurable.
-            if last_time.elapsed() > Duration::from_secs(5) {
-                bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
-            } else if last_history.samples_count <= 1 {
-                let status = "haven't received enough cgroup memory stats yet";
-                info!(status, "discontinuing downscale");
-                return Ok((false, status.to_owned()));
-            }
-
-            let new_threshold = self
+            new_cgroup_mem_high = cgroup
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
+                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);

-            let current = last_history.avg_non_reclaimable;
+            let current = cgroup
+                .current_memory_usage()
+                .context("failed to fetch cgroup memory")?;

-            if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes {
+            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
                let status = format!(
-                    "{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
-                    "calculated memory threshold too low",
-                    bytes_to_mebibytes(new_threshold),
+                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
+                    "calculated memory.high too low",
+                    bytes_to_mebibytes(new_cgroup_mem_high),
                    bytes_to_mebibytes(current),
-                    bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes)
+                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
                );

                info!(status, "discontinuing downscale");
@@ -294,14 +249,14 @@ impl Runner {

        // The downscaling has been approved. Downscale the file cache, then the cgroup.
        let mut status = vec![];
-        let mut file_cache_disk_size = 0;
+        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
            let actual_usage = file_cache
-                .set_file_cache_size(expected_file_cache_size)
+                .set_file_cache_size(expected_file_cache_mem_usage)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
            }
            let message = format!(
                "set file cache size to {} MiB (in memory = {})",
@@ -312,18 +267,24 @@ impl Runner {
            status.push(message);
        }

-        if let Some(cgroup) = &mut self.cgroup {
-            let new_threshold = self
-                .config
-                .cgroup_threshold(usable_system_memory, file_cache_disk_size);
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+
+            if file_cache_mem_usage != expected_file_cache_mem_usage {
+                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            }
+
+            // new_cgroup_mem_high is initialized to 0, but it is guaranteed to have been
+            // assigned by now, since the previous `if let Some(cgroup)` block sets it
+            cgroup
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;

            let message = format!(
-                "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
-                bytes_to_mebibytes(cgroup.threshold),
-                bytes_to_mebibytes(new_threshold),
-                bytes_to_mebibytes(usable_system_memory)
+                "set cgroup memory.high to {} MiB, of new max {} MiB",
+                bytes_to_mebibytes(new_cgroup_mem_high),
+                bytes_to_mebibytes(available_memory)
            );
-            cgroup.threshold = new_threshold;
            info!("downscale: {message}");
            status.push(message);
        }
@@ -344,7 +305,8 @@ impl Runner {
        let new_mem = resources.mem;
        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);

-        let mut file_cache_disk_size = 0;
+        // Get the file cache's expected contribution to the memory usage
+        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
@@ -357,8 +319,8 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
            }

            if actual_usage != expected_usage {
@@ -370,18 +332,18 @@ impl Runner {
            }
        }

-        if let Some(cgroup) = &mut self.cgroup {
-            let new_threshold = self
-                .config
-                .cgroup_threshold(usable_system_memory, file_cache_disk_size);
-
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
            info!(
-                "set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
-                bytes_to_mebibytes(cgroup.threshold),
-                bytes_to_mebibytes(new_threshold),
-                bytes_to_mebibytes(usable_system_memory)
+                target = bytes_to_mebibytes(new_cgroup_mem_high),
+                total = bytes_to_mebibytes(new_mem),
+                name = cgroup.path(),
+                "updating cgroup memory.high",
            );
-            cgroup.threshold = new_threshold;
+            cgroup
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
        }

        Ok(())
@@ -399,6 +361,10 @@ impl Runner {
                self.handle_upscale(granted)
                    .await
                    .context("failed to handle upscale")?;
+                self.dispatcher
+                    .notify_upscale(Sequenced::new(granted))
+                    .await
+                    .context("failed to notify notify cgroup of upscale")?;
                Ok(Some(OutboundMsg::new(
                    OutboundMsgKind::UpscaleConfirmation {},
                    id,
@@ -442,53 +408,33 @@ impl Runner {
                        Err(e) => bail!("failed to receive kill signal: {e}")
                    }
                }
-
-                // New memory stats from the cgroup, *may* need to request upscaling, if we've
-                // exceeded the threshold
-                result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => {
-                    result.context("failed to receive from cgroup memory stats watcher")?;
-
-                    let cgroup = self.cgroup.as_ref().unwrap();
-
-                    let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow();
-
-                    // If we haven't exceeded the threshold, then we're all ok
-                    if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold {
-                        continue;
+                // we need to propagate an upscale request
+                request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
+                    if request.is_none() {
+                        bail!("failed to listen for upscale event from cgroup")
                    }

-                    // Otherwise, we generally want upscaling. But, if it's been less than 1 second
-                    // since the last time we requested upscaling, ignore the event, to avoid
-                    // spamming the agent.
+                    // If it's been less than 1 second since the last time we requested upscaling,
+                    // ignore the event, to avoid spamming the agent (otherwise, this can happen
+                    // ~1k times per second).
                    if let Some(t) = self.last_upscale_request_at {
                        let elapsed = t.elapsed();
                        if elapsed < Duration::from_secs(1) {
-                            info!(
-                                elapsed_millis = elapsed.as_millis(),
-                                avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
-                                threshold = bytes_to_mebibytes(cgroup.threshold),
-                                "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
-                            );
+                            info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
                            continue;
                        }
                    }

                    self.last_upscale_request_at = Some(Instant::now());

-                    info!(
-                        avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
-                        threshold = bytes_to_mebibytes(cgroup.threshold),
-                        "cgroup memory stats are high enough to upscale, requesting upscale",
-                    );
-
+                    info!("cgroup asking for upscale; forwarding request");
                    self.counter += 2; // Increment, preserving parity (i.e. keep the
                                       // counter odd). See the field comment for more.
                    self.dispatcher
                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
                        .await
                        .context("failed to send message")?;
-                },
-
+                }
                // there is a message from the agent
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
@@ -516,14 +462,11 @@ impl Runner {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
                                    Err(e) => {
-                                        // use {:#} for our logging because the display impl only
-                                        // gives the outermost cause, and the debug impl
-                                        // pretty-prints the error, whereas {:#} contains all the
-                                        // causes, but is compact (no newlines).
-                                        warn!(error = format!("{e:#}"), "error handling message");
+                                        let error = e.to_string();
+                                        warn!(?error, "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
-                                                error: e.to_string(),
+                                                error
                                            },
                                            message.id
                                        )
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -1,16 +0,0 @@
-[package]
-name = "walproposer"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-anyhow.workspace = true
-utils.workspace = true
-postgres_ffi.workspace = true
-
-workspace_hack.workspace = true
-
-[build-dependencies]
-anyhow.workspace = true
-bindgen.workspace = true
--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -1 +0,0 @@
-#include "walproposer.h"
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -1,113 +0,0 @@
-use std::{env, path::PathBuf, process::Command};
-
-use anyhow::{anyhow, Context};
-use bindgen::CargoCallbacks;
-
-fn main() -> anyhow::Result<()> {
-    // Tell cargo to invalidate the built crate whenever the wrapper changes
-    println!("cargo:rerun-if-changed=bindgen_deps.h");
-
-    // Finding the location of built libraries and Postgres C headers:
-    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
-    // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
-    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
-        postgres_install_dir.into()
-    } else {
-        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
-    };
-
-    let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
-    let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
-    let walproposer_lib_search_str = walproposer_lib_dir
-        .to_str()
-        .ok_or(anyhow!("Bad non-UTF path"))?;
-
-    let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
-    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
-    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
-
-    println!("cargo:rustc-link-lib=static=pgport");
-    println!("cargo:rustc-link-lib=static=pgcommon");
-    println!("cargo:rustc-link-lib=static=walproposer");
-    println!("cargo:rustc-link-search={walproposer_lib_search_str}");
-
-    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
-    let inc_server_path: String = if pg_config_bin.exists() {
-        let output = Command::new(pg_config_bin)
-            .arg("--includedir-server")
-            .output()
-            .context("failed to execute `pg_config --includedir-server`")?;
-
-        if !output.status.success() {
-            panic!("`pg_config --includedir-server` failed")
-        }
-
-        String::from_utf8(output.stdout)
-            .context("pg_config output is not UTF-8")?
-            .trim_end()
-            .into()
-    } else {
-        let server_path = pg_install_abs
-            .join("v16")
-            .join("include")
-            .join("postgresql")
-            .join("server")
-            .into_os_string();
-        server_path
-            .into_string()
-            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
-    };
-
-    // The bindgen::Builder is the main entry point
-    // to bindgen, and lets you build up options for
-    // the resulting bindings.
-    let bindings = bindgen::Builder::default()
-        // The input header we would like to generate
-        // bindings for.
-        .header("bindgen_deps.h")
-        // Tell cargo to invalidate the built crate whenever any of the
-        // included header files changed.
-        .parse_callbacks(Box::new(CargoCallbacks))
-        .allowlist_type("WalProposer")
-        .allowlist_type("WalProposerConfig")
-        .allowlist_type("walproposer_api")
-        .allowlist_function("WalProposerCreate")
-        .allowlist_function("WalProposerStart")
-        .allowlist_function("WalProposerBroadcast")
-        .allowlist_function("WalProposerPoll")
-        .allowlist_function("WalProposerFree")
-        .allowlist_var("DEBUG5")
-        .allowlist_var("DEBUG4")
-        .allowlist_var("DEBUG3")
-        .allowlist_var("DEBUG2")
-        .allowlist_var("DEBUG1")
-        .allowlist_var("LOG")
-        .allowlist_var("INFO")
-        .allowlist_var("NOTICE")
-        .allowlist_var("WARNING")
-        .allowlist_var("ERROR")
-        .allowlist_var("FATAL")
-        .allowlist_var("PANIC")
-        .allowlist_var("WPEVENT")
-        .allowlist_var("WL_LATCH_SET")
-        .allowlist_var("WL_SOCKET_READABLE")
-        .allowlist_var("WL_SOCKET_WRITEABLE")
-        .allowlist_var("WL_TIMEOUT")
-        .allowlist_var("WL_SOCKET_CLOSED")
-        .allowlist_var("WL_SOCKET_MASK")
-        .clang_arg("-DWALPROPOSER_LIB")
-        .clang_arg(format!("-I{pgxn_neon}"))
-        .clang_arg(format!("-I{inc_server_path}"))
-        // Finish the builder and generate the bindings.
-        .generate()
-        // Unwrap the Result and panic on failure.
-        .expect("Unable to generate bindings");
-
-    // Write the bindings to the $OUT_DIR/bindings.rs file.
-    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
-    bindings
-        .write_to_file(out_path)
-        .expect("Couldn't write bindings!");
-
-    Ok(())
-}
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -1,455 +0,0 @@
-#![allow(dead_code)]
-
-use std::ffi::CStr;
-use std::ffi::CString;
-
-use crate::bindings::uint32;
-use crate::bindings::walproposer_api;
-use crate::bindings::PGAsyncReadResult;
-use crate::bindings::PGAsyncWriteResult;
-use crate::bindings::Safekeeper;
-use crate::bindings::Size;
-use crate::bindings::StringInfoData;
-use crate::bindings::TimeLineID;
-use crate::bindings::TimestampTz;
-use crate::bindings::WalProposer;
-use crate::bindings::WalProposerConnStatusType;
-use crate::bindings::WalProposerConnectPollStatusType;
-use crate::bindings::WalProposerExecStatusType;
-use crate::bindings::WalproposerShmemState;
-use crate::bindings::XLogRecPtr;
-use crate::walproposer::ApiImpl;
-use crate::walproposer::WaitResult;
-
-extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).get_shmem_state()
-    }
-}
-
-extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).start_streaming(startpos)
-    }
-}
-
-extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).get_flush_rec_ptr()
-    }
-}
-
-extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).get_current_timestamp()
-    }
-}
-
-extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        let msg = (*api).conn_error_message(&mut (*sk));
-        let msg = CString::new(msg).unwrap();
-        // TODO: fix leaking error message
-        msg.into_raw()
-    }
-}
-
-extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_status(&mut (*sk))
-    }
-}
-
-extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_connect_start(&mut (*sk))
-    }
-}
-
-extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_connect_poll(&mut (*sk))
-    }
-}
-
-extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
-    let query = unsafe { CStr::from_ptr(query) };
-    let query = query.to_str().unwrap();
-
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_send_query(&mut (*sk), query)
-    }
-}
-
-extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_get_query_result(&mut (*sk))
-    }
-}
-
-extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_flush(&mut (*sk))
-    }
-}
-
-extern "C" fn conn_finish(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_finish(&mut (*sk))
-    }
-}
-
-extern "C" fn conn_async_read(
-    sk: *mut Safekeeper,
-    buf: *mut *mut ::std::os::raw::c_char,
-    amount: *mut ::std::os::raw::c_int,
-) -> PGAsyncReadResult {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        let (res, result) = (*api).conn_async_read(&mut (*sk));
-
-        // This function has guarantee that returned buf will be valid until
-        // the next call. So we can store a Vec in each Safekeeper and reuse
-        // it on the next call.
-        let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
-
-        inbuf.clear();
-        inbuf.extend_from_slice(res);
-
-        // Put a Vec back to sk->inbuf and return data ptr.
-        *buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
-        *amount = res.len() as i32;
-
-        result
-    }
-}
-
-extern "C" fn conn_async_write(
-    sk: *mut Safekeeper,
-    buf: *const ::std::os::raw::c_void,
-    size: usize,
-) -> PGAsyncWriteResult {
-    unsafe {
-        let buf = std::slice::from_raw_parts(buf as *const u8, size);
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_async_write(&mut (*sk), buf)
-    }
-}
-
-extern "C" fn conn_blocking_write(
-    sk: *mut Safekeeper,
-    buf: *const ::std::os::raw::c_void,
-    size: usize,
-) -> bool {
-    unsafe {
-        let buf = std::slice::from_raw_parts(buf as *const u8, size);
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).conn_blocking_write(&mut (*sk), buf)
-    }
-}
-
-extern "C" fn recovery_download(
-    sk: *mut Safekeeper,
-    _timeline: TimeLineID,
-    startpos: XLogRecPtr,
-    endpos: XLogRecPtr,
-) -> bool {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).recovery_download(&mut (*sk), startpos, endpos)
-    }
-}
-
-extern "C" fn wal_read(
-    sk: *mut Safekeeper,
-    buf: *mut ::std::os::raw::c_char,
-    startptr: XLogRecPtr,
-    count: Size,
-) {
-    unsafe {
-        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).wal_read(&mut (*sk), buf, startptr)
-    }
-}
-
-extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).wal_reader_allocate(&mut (*sk));
-    }
-}
-
-extern "C" fn free_event_set(wp: *mut WalProposer) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).free_event_set(&mut (*wp));
-    }
-}
-
-extern "C" fn init_event_set(wp: *mut WalProposer) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).init_event_set(&mut (*wp));
-    }
-}
-
-extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).update_event_set(&mut (*sk), events);
-    }
-}
-
-extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).add_safekeeper_event_set(&mut (*sk), events);
-    }
-}
-
-extern "C" fn wait_event_set(
-    wp: *mut WalProposer,
-    timeout: ::std::os::raw::c_long,
-    event_sk: *mut *mut Safekeeper,
-    events: *mut uint32,
-) -> ::std::os::raw::c_int {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        let result = (*api).wait_event_set(&mut (*wp), timeout);
-        match result {
-            WaitResult::Latch => {
-                *event_sk = std::ptr::null_mut();
-                *events = crate::bindings::WL_LATCH_SET;
-                1
-            }
-            WaitResult::Timeout => {
-                *event_sk = std::ptr::null_mut();
-                *events = crate::bindings::WL_TIMEOUT;
-                0
-            }
-            WaitResult::Network(sk, event_mask) => {
-                *event_sk = sk;
-                *events = event_mask;
-                1
-            }
-        }
-    }
-}
-
-extern "C" fn strong_random(
-    wp: *mut WalProposer,
-    buf: *mut ::std::os::raw::c_void,
-    len: usize,
-) -> bool {
-    unsafe {
-        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).strong_random(buf)
-    }
-}
-
-extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).get_redo_start_lsn()
-    }
-}
-
-extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).finish_sync_safekeepers(lsn)
-    }
-}
-
-extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
-    }
-}
-
-extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).confirm_wal_streamed(&mut (*wp), lsn)
-    }
-}
-
-extern "C" fn log_internal(
-    wp: *mut WalProposer,
-    level: ::std::os::raw::c_int,
-    line: *const ::std::os::raw::c_char,
-) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        let line = CStr::from_ptr(line);
-        let line = line.to_str().unwrap();
-        (*api).log_internal(&mut (*wp), Level::from(level as u32), line)
-    }
-}
-
-extern "C" fn after_election(wp: *mut WalProposer) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).after_election(&mut (*wp))
-    }
-}
-
-#[derive(Debug)]
-pub enum Level {
-    Debug5,
-    Debug4,
-    Debug3,
-    Debug2,
-    Debug1,
-    Log,
-    Info,
-    Notice,
-    Warning,
-    Error,
-    Fatal,
-    Panic,
-    WPEvent,
-}
-
-impl Level {
-    pub fn from(elevel: u32) -> Level {
-        use crate::bindings::*;
-
-        match elevel {
-            DEBUG5 => Level::Debug5,
-            DEBUG4 => Level::Debug4,
-            DEBUG3 => Level::Debug3,
-            DEBUG2 => Level::Debug2,
-            DEBUG1 => Level::Debug1,
-            LOG => Level::Log,
-            INFO => Level::Info,
-            NOTICE => Level::Notice,
-            WARNING => Level::Warning,
-            ERROR => Level::Error,
-            FATAL => Level::Fatal,
-            PANIC => Level::Panic,
-            WPEVENT => Level::WPEvent,
-            _ => panic!("unknown log level {}", elevel),
-        }
-    }
-}
-
-pub(crate) fn create_api() -> walproposer_api {
-    walproposer_api {
-        get_shmem_state: Some(get_shmem_state),
-        start_streaming: Some(start_streaming),
-        get_flush_rec_ptr: Some(get_flush_rec_ptr),
-        get_current_timestamp: Some(get_current_timestamp),
-        conn_error_message: Some(conn_error_message),
-        conn_status: Some(conn_status),
-        conn_connect_start: Some(conn_connect_start),
-        conn_connect_poll: Some(conn_connect_poll),
-        conn_send_query: Some(conn_send_query),
-        conn_get_query_result: Some(conn_get_query_result),
-        conn_flush: Some(conn_flush),
-        conn_finish: Some(conn_finish),
-        conn_async_read: Some(conn_async_read),
-        conn_async_write: Some(conn_async_write),
-        conn_blocking_write: Some(conn_blocking_write),
-        recovery_download: Some(recovery_download),
-        wal_read: Some(wal_read),
-        wal_reader_allocate: Some(wal_reader_allocate),
-        free_event_set: Some(free_event_set),
-        init_event_set: Some(init_event_set),
-        update_event_set: Some(update_event_set),
-        add_safekeeper_event_set: Some(add_safekeeper_event_set),
-        wait_event_set: Some(wait_event_set),
-        strong_random: Some(strong_random),
-        get_redo_start_lsn: Some(get_redo_start_lsn),
-        finish_sync_safekeepers: Some(finish_sync_safekeepers),
-        process_safekeeper_feedback: Some(process_safekeeper_feedback),
-        confirm_wal_streamed: Some(confirm_wal_streamed),
-        log_internal: Some(log_internal),
-        after_election: Some(after_election),
-    }
-}
-
-impl std::fmt::Display for Level {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
-/// Take ownership of `Vec<u8>` from StringInfoData.
-pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
-    if pg.data.is_null() {
-        return None;
-    }
-
-    let ptr = pg.data as *mut u8;
-    let length = pg.len as usize;
-    let capacity = pg.maxlen as usize;
-
-    pg.data = std::ptr::null_mut();
-    pg.len = 0;
-    pg.maxlen = 0;
-
-    unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
-}
-
-/// Store `Vec<u8>` in StringInfoData.
-fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
-    let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
-    let length = vec.len();
-    let capacity = vec.capacity();
-
-    assert!(pg.data.is_null());
-
-    pg.data = ptr;
-    pg.len = length as i32;
-    pg.maxlen = capacity as i32;
-
-    std::mem::forget(vec);
-
-    ptr
-}
--- a/libs/walproposer/src/lib.rs
+++ b/libs/walproposer/src/lib.rs
@@ -1,14 +0,0 @@
-pub mod bindings {
-    #![allow(non_upper_case_globals)]
-    #![allow(non_camel_case_types)]
-    #![allow(non_snake_case)]
-    // bindgen creates some unsafe code with no doc comments.
-    #![allow(clippy::missing_safety_doc)]
-    // noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
-    #![allow(clippy::useless_transmute)]
-
-    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
-}
-
-pub mod api_bindings;
-pub mod walproposer;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -1,485 +0,0 @@
-use std::ffi::CString;
-
-use postgres_ffi::WAL_SEGMENT_SIZE;
-use utils::id::TenantTimelineId;
-
-use crate::{
-    api_bindings::{create_api, take_vec_u8, Level},
-    bindings::{
-        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
-        WalProposerStart,
-    },
-};
-
-/// Rust high-level wrapper for C walproposer API. Many methods are not required
-/// for simple cases, hence todo!() in default implementations.
-///
-/// Refer to `pgxn/neon/walproposer.h` for documentation.
-pub trait ApiImpl {
-    fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
-        todo!()
-    }
-
-    fn start_streaming(&self, _startpos: u64) {
-        todo!()
-    }
-
-    fn get_flush_rec_ptr(&self) -> u64 {
-        todo!()
-    }
-
-    fn get_current_timestamp(&self) -> i64 {
-        todo!()
-    }
-
-    fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
-        todo!()
-    }
-
-    fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
-        todo!()
-    }
-
-    fn conn_connect_start(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
-    fn conn_connect_poll(
-        &self,
-        _sk: &mut Safekeeper,
-    ) -> crate::bindings::WalProposerConnectPollStatusType {
-        todo!()
-    }
-
-    fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
-        todo!()
-    }
-
-    fn conn_get_query_result(
-        &self,
-        _sk: &mut Safekeeper,
-    ) -> crate::bindings::WalProposerExecStatusType {
-        todo!()
-    }
-
-    fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
-        todo!()
-    }
-
-    fn conn_finish(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
-    fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
-        todo!()
-    }
-
-    fn conn_async_write(
-        &self,
-        _sk: &mut Safekeeper,
-        _buf: &[u8],
-    ) -> crate::bindings::PGAsyncWriteResult {
-        todo!()
-    }
-
-    fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
-        todo!()
-    }
-
-    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
-        todo!()
-    }
-
-    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
-        todo!()
-    }
-
-    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
-    fn free_event_set(&self, _wp: &mut WalProposer) {
-        todo!()
-    }
-
-    fn init_event_set(&self, _wp: &mut WalProposer) {
-        todo!()
-    }
-
-    fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
-        todo!()
-    }
-
-    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
-        todo!()
-    }
-
-    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
-        todo!()
-    }
-
-    fn strong_random(&self, _buf: &mut [u8]) -> bool {
-        todo!()
-    }
-
-    fn get_redo_start_lsn(&self) -> u64 {
-        todo!()
-    }
-
-    fn finish_sync_safekeepers(&self, _lsn: u64) {
-        todo!()
-    }
-
-    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
-        todo!()
-    }
-
-    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
-        todo!()
-    }
-
-    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
-        todo!()
-    }
-
-    fn after_election(&self, _wp: &mut WalProposer) {
-        todo!()
-    }
-}
-
-pub enum WaitResult {
-    Latch,
-    Timeout,
-    Network(*mut Safekeeper, u32),
-}
-
-pub struct Config {
-    /// Tenant and timeline id
-    pub ttid: TenantTimelineId,
-    /// List of safekeepers in format `host:port`
-    pub safekeepers_list: Vec<String>,
-    /// Safekeeper reconnect timeout in milliseconds
-    pub safekeeper_reconnect_timeout: i32,
-    /// Safekeeper connection timeout in milliseconds
-    pub safekeeper_connection_timeout: i32,
-    /// walproposer mode, finish when all safekeepers are synced or subscribe
-    /// to WAL streaming
-    pub sync_safekeepers: bool,
-}
-
-/// WalProposer main struct. C methods are reexported as Rust functions.
-pub struct Wrapper {
-    wp: *mut WalProposer,
-    _safekeepers_list_vec: Vec<u8>,
-}
-
-impl Wrapper {
-    pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
-        let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
-            .unwrap()
-            .into_raw();
-        let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
-            .unwrap()
-            .into_raw();
-
-        let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
-            .unwrap()
-            .into_bytes_with_nul();
-        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
-
-        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
-
-        let c_config = WalProposerConfig {
-            neon_tenant,
-            neon_timeline,
-            safekeepers_list,
-            safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
-            safekeeper_connection_timeout: config.safekeeper_connection_timeout,
-            wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
-            syncSafekeepers: config.sync_safekeepers,
-            systemId: 0,
-            pgTimeline: 1,
-            callback_data,
-        };
-        let c_config = Box::into_raw(Box::new(c_config));
-
-        let api = create_api();
-        let wp = unsafe { WalProposerCreate(c_config, api) };
-        Wrapper {
-            wp,
-            _safekeepers_list_vec: safekeepers_list_vec,
-        }
-    }
-
-    pub fn start(&self) {
-        unsafe { WalProposerStart(self.wp) }
-    }
-}
-
-impl Drop for Wrapper {
-    fn drop(&mut self) {
-        unsafe {
-            let config = (*self.wp).config;
-            drop(Box::from_raw(
-                (*config).callback_data as *mut Box<dyn ApiImpl>,
-            ));
-            drop(CString::from_raw((*config).neon_tenant));
-            drop(CString::from_raw((*config).neon_timeline));
-            drop(Box::from_raw(config));
-
-            for i in 0..(*self.wp).n_safekeepers {
-                let sk = &mut (*self.wp).safekeeper[i as usize];
-                take_vec_u8(&mut sk.inbuf);
-            }
-
-            WalProposerFree(self.wp);
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::{
-        cell::Cell,
-        sync::{atomic::AtomicUsize, mpsc::sync_channel},
-    };
-
-    use utils::id::TenantTimelineId;
-
-    use crate::{api_bindings::Level, walproposer::Wrapper};
-
-    use super::ApiImpl;
-
-    #[derive(Clone, Copy, Debug)]
-    struct WaitEventsData {
-        sk: *mut crate::bindings::Safekeeper,
-        event_mask: u32,
-    }
-
-    struct MockImpl {
-        // data to return from wait_event_set
-        wait_events: Cell<WaitEventsData>,
-        // walproposer->safekeeper messages
-        expected_messages: Vec<Vec<u8>>,
-        expected_ptr: AtomicUsize,
-        // safekeeper->walproposer messages
-        safekeeper_replies: Vec<Vec<u8>>,
-        replies_ptr: AtomicUsize,
-        // channel to send LSN to the main thread
-        sync_channel: std::sync::mpsc::SyncSender<u64>,
-    }
-
-    impl MockImpl {
-        fn check_walproposer_msg(&self, msg: &[u8]) {
-            let ptr = self
-                .expected_ptr
-                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
-
-            if ptr >= self.expected_messages.len() {
-                panic!("unexpected message from walproposer");
-            }
-
-            let expected_msg = &self.expected_messages[ptr];
-            assert_eq!(msg, expected_msg.as_slice());
-        }
-
-        fn next_safekeeper_reply(&self) -> &[u8] {
-            let ptr = self
-                .replies_ptr
-                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
-
-            if ptr >= self.safekeeper_replies.len() {
-                panic!("no more safekeeper replies");
-            }
-
-            &self.safekeeper_replies[ptr]
-        }
-    }
-
-    impl ApiImpl for MockImpl {
-        fn get_current_timestamp(&self) -> i64 {
-            println!("get_current_timestamp");
-            0
-        }
-
-        fn conn_status(
-            &self,
-            _: &mut crate::bindings::Safekeeper,
-        ) -> crate::bindings::WalProposerConnStatusType {
-            println!("conn_status");
-            crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
-        }
-
-        fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
-            println!("conn_connect_start");
-        }
-
-        fn conn_connect_poll(
-            &self,
-            _: &mut crate::bindings::Safekeeper,
-        ) -> crate::bindings::WalProposerConnectPollStatusType {
-            println!("conn_connect_poll");
-            crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
-        }
-
-        fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
-            println!("conn_send_query: {}", query);
-            true
-        }
-
-        fn conn_get_query_result(
-            &self,
-            _: &mut crate::bindings::Safekeeper,
-        ) -> crate::bindings::WalProposerExecStatusType {
-            println!("conn_get_query_result");
-            crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
-        }
-
-        fn conn_async_read(
-            &self,
-            _: &mut crate::bindings::Safekeeper,
-        ) -> (&[u8], crate::bindings::PGAsyncReadResult) {
-            println!("conn_async_read");
-            let reply = self.next_safekeeper_reply();
-            println!("conn_async_read result: {:?}", reply);
-            (
-                reply,
-                crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
-            )
-        }
-
-        fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
-            println!("conn_blocking_write: {:?}", buf);
-            self.check_walproposer_msg(buf);
-            true
-        }
-
-        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
-            println!("wal_reader_allocate")
-        }
-
-        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
-            println!("free_event_set")
-        }
-
-        fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
-            println!("init_event_set")
-        }
-
-        fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
-            println!(
-                "update_event_set, sk={:?}, events_mask={:#b}",
-                sk as *mut crate::bindings::Safekeeper, event_mask
-            );
-            self.wait_events.set(WaitEventsData { sk, event_mask });
-        }
-
-        fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
-            println!(
-                "add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
-                sk as *mut crate::bindings::Safekeeper, event_mask
-            );
-            self.wait_events.set(WaitEventsData { sk, event_mask });
-        }
-
-        fn wait_event_set(
-            &self,
-            _: &mut crate::bindings::WalProposer,
-            timeout_millis: i64,
-        ) -> super::WaitResult {
-            let data = self.wait_events.get();
-            println!(
-                "wait_event_set, timeout_millis={}, res={:?}",
-                timeout_millis, data
-            );
-            super::WaitResult::Network(data.sk, data.event_mask)
-        }
-
-        fn strong_random(&self, buf: &mut [u8]) -> bool {
-            println!("strong_random");
-            buf.fill(0);
-            true
-        }
-
-        fn finish_sync_safekeepers(&self, lsn: u64) {
-            self.sync_channel.send(lsn).unwrap();
-            panic!("sync safekeepers finished at lsn={}", lsn);
-        }
-
-        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("walprop_log[{}] {}", level, msg);
-        }
-
-        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
-            println!("after_election");
-        }
-    }
-
-    /// Test that walproposer can successfully connect to safekeeper and finish
-    /// sync_safekeepers. API is mocked in MockImpl.
-    ///
-    /// Run this test with valgrind to detect leaks:
-    /// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
-    #[test]
-    fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
-        let ttid = TenantTimelineId::new(
-            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
-            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
-        );
-
-        let (sender, receiver) = sync_channel(1);
-
-        let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
-            wait_events: Cell::new(WaitEventsData {
-                sk: std::ptr::null_mut(),
-                event_mask: 0,
-            }),
-            expected_messages: vec![
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
-                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
-                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
-                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
-                ],
-                // VoteRequest(VoteRequest { term: 3 })
-                vec![
-                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                    0, 0, 0, 0, 0, 0,
-                ],
-            ],
-            expected_ptr: AtomicUsize::new(0),
-            safekeeper_replies: vec![
-                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
-                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                ],
-                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
-                vec![
-                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
-                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
-                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
-                ],
-            ],
-            replies_ptr: AtomicUsize::new(0),
-            sync_channel: sender,
-        });
-        let config = crate::walproposer::Config {
-            ttid,
-            safekeepers_list: vec!["localhost:5000".to_string()],
-            safekeeper_reconnect_timeout: 1000,
-            safekeeper_connection_timeout: 10000,
-            sync_safekeepers: true,
-        };
-
-        let wp = Wrapper::new(my_impl, config);
-
-        // walproposer will panic when it finishes sync_safekeepers
-        std::panic::catch_unwind(|| wp.start()).unwrap_err();
-        // validate the resulting LSN
-        assert_eq!(receiver.recv()?, 1337);
-        Ok(())
-        // drop() will free up resources here
-    }
-}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,6 +37,7 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
+nostarve_queue.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
 num-traits.workspace = true
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -11,7 +11,10 @@ use std::sync::{Arc, Barrier};

 use bytes::{Buf, Bytes};
 use pageserver::{
-    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
+    config::PageServerConf,
+    repository::Key,
+    walrecord::NeonWalRecord,
+    walredo::{PostgresRedoManager, WalRedoError},
 };
 use utils::{id::TenantId, lsn::Lsn};

@@ -32,15 +35,9 @@ fn redo_scenarios(c: &mut Criterion) {

    let manager = Arc::new(manager);

-    {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        tracing::info!("executing first");
-        short().execute(rt.handle(), &manager).unwrap();
-        tracing::info!("first executed");
-    }
+    tracing::info!("executing first");
+    short().execute(&manager).unwrap();
+    tracing::info!("first executed");

    let thread_counts = [1, 2, 4, 8, 16];

@@ -83,14 +80,9 @@ fn add_multithreaded_walredo_requesters(
    assert_ne!(threads, 0);

    if threads == 1 {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        let handle = rt.handle();
        b.iter_batched_ref(
            || Some(input_factory()),
-            |input| execute_all(input.take(), handle, manager),
+            |input| execute_all(input.take(), manager),
            criterion::BatchSize::PerIteration,
        );
    } else {
@@ -106,26 +98,19 @@ fn add_multithreaded_walredo_requesters(
                    let manager = manager.clone();
                    let barrier = barrier.clone();
                    let work_rx = work_rx.clone();
-                    move || {
-                        let rt = tokio::runtime::Builder::new_current_thread()
-                            .enable_all()
-                            .build()
-                            .unwrap();
-                        let handle = rt.handle();
-                        loop {
-                            // queue up and wait if we want to go another round
-                            if work_rx.lock().unwrap().recv().is_err() {
-                                break;
-                            }
-
-                            let input = Some(input_factory());
-
-                            barrier.wait();
-
-                            execute_all(input, handle, &manager).unwrap();
-
-                            barrier.wait();
+                    move || loop {
+                        // queue up and wait if we want to go another round
+                        if work_rx.lock().unwrap().recv().is_err() {
+                            break;
                        }
+
+                        let input = Some(input_factory());
+
+                        barrier.wait();
+
+                        execute_all(input, &manager).unwrap();
+
+                        barrier.wait();
                    }
                })
            })
@@ -167,19 +152,15 @@ impl Drop for JoinOnDrop {
    }
 }

-fn execute_all<I>(
-    input: I,
-    handle: &tokio::runtime::Handle,
-    manager: &PostgresRedoManager,
-) -> anyhow::Result<()>
+fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError>
 where
    I: IntoIterator<Item = Request>,
 {
    // just fire all requests as fast as possible
    input.into_iter().try_for_each(|req| {
-        let page = req.execute(handle, manager)?;
+        let page = req.execute(manager)?;
        assert_eq!(page.remaining(), 8192);
-        anyhow::Ok(())
+        Ok::<_, WalRedoError>(())
    })
 }

@@ -492,11 +473,9 @@ struct Request {
 }

 impl Request {
-    fn execute(
-        self,
-        rt: &tokio::runtime::Handle,
-        manager: &PostgresRedoManager,
-    ) -> anyhow::Result<Bytes> {
+    fn execute(self, manager: &PostgresRedoManager) -> Result<Bytes, WalRedoError> {
+        use pageserver::walredo::WalRedoManager;
+
        let Request {
            key,
            lsn,
@@ -505,6 +484,6 @@ impl Request {
            pg_version,
        } = self;

-        rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
+        manager.request_redo(key, lsn, base_img, records, pg_version)
    }
 }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,6 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
-use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
 use tokio::io;
@@ -181,7 +180,6 @@ where
            }
        }

-        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
        for ((spcnode, dbnode), has_relmap_file) in
            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
@@ -215,34 +213,6 @@ where
                    self.add_rel(rel, rel).await?;
                }
            }
-
-            for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
-                if path.starts_with("pg_replslot") {
-                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
-                    let restart_lsn = Lsn(u64::from_le_bytes(
-                        content[offs..offs + 8].try_into().unwrap(),
-                    ));
-                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
-                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-                }
-                let header = new_tar_header(&path, content.len() as u64)?;
-                self.ar
-                    .append(&header, &*content)
-                    .await
-                    .context("could not add aux file to basebackup tarball")?;
-            }
-        }
-        if min_restart_lsn != Lsn::MAX {
-            info!(
-                "Min restart LSN for logical replication is {}",
-                min_restart_lsn
-            );
-            let data = min_restart_lsn.0.to_le_bytes();
-            let header = new_tar_header("restart.lsn", data.len() as u64)?;
-            self.ar
-                .append(&header, &data[..])
-                .await
-                .context("could not add restart.lsn file to basebackup tarball")?;
        }
        for xid in self
            .timeline
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,7 +2,6 @@

 use std::env::{var, VarError};
 use std::sync::Arc;
-use std::time::Duration;
 use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
@@ -14,7 +13,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
-use pageserver::tenant::{secondary, TenantSharedResources};
+use pageserver::tenant::TenantSharedResources;
 use remote_storage::GenericRemoteStorage;
 use tokio::time::Instant;
 use tracing::*;
@@ -201,51 +200,6 @@ fn initialize_config(
    })
 }

-struct WaitForPhaseResult<F: std::future::Future + Unpin> {
-    timeout_remaining: Duration,
-    skipped: Option<F>,
-}
-
-/// During startup, we apply a timeout to our waits for readiness, to avoid
-/// stalling the whole service if one Tenant experiences some problem.  Each
-/// phase may consume some of the timeout: this function returns the updated
-/// timeout for use in the next call.
-async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
-where
-    F: std::future::Future + Unpin,
-{
-    let initial_t = Instant::now();
-    let skipped = match tokio::time::timeout(timeout, &mut fut).await {
-        Ok(_) => None,
-        Err(_) => {
-            tracing::info!(
-                timeout_millis = timeout.as_millis(),
-                %phase,
-                "Startup phase timed out, proceeding anyway"
-            );
-            Some(fut)
-        }
-    };
-
-    WaitForPhaseResult {
-        timeout_remaining: timeout
-            .checked_sub(Instant::now().duration_since(initial_t))
-            .unwrap_or(Duration::ZERO),
-        skipped,
-    }
-}
-
-fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
-    let elapsed = started_at.elapsed();
-    let secs = elapsed.as_secs_f64();
-    STARTUP_DURATION.with_label_values(&[phase]).set(secs);
-
-    info!(
-        elapsed_ms = elapsed.as_millis(),
-        "{human_phase} ({secs:.3}s since start)"
-    )
-}
-
 fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
@@ -253,6 +207,16 @@ fn start_pageserver(
    // Monotonic time for later calculating startup duration
    let started_startup_at = Instant::now();

+    let startup_checkpoint = move |phase: &str, human_phase: &str| {
+        let elapsed = started_startup_at.elapsed();
+        let secs = elapsed.as_secs_f64();
+        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "{human_phase} ({secs:.3}s since start)"
+        )
+    };
+
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -377,7 +341,7 @@ fn start_pageserver(

    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
-    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
+    startup_checkpoint("initial", "Starting loading tenants");
    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
@@ -391,7 +355,6 @@ fn start_pageserver(
    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
    // (background_task_maximum_delay).
-    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
@@ -399,8 +362,7 @@ fn start_pageserver(
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
-        initial_tenant_load_remote: Some(init_done_tx),
-        initial_tenant_load: Some(init_remote_done_tx),
+        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
@@ -408,7 +370,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -418,100 +380,61 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
-    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before initial load completed")
-            });
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

-            let timeout = conf.background_task_maximum_delay;
-
-            let init_remote_done = std::pin::pin!(async {
-                init_remote_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_tenant_load_remote",
-                    "Remote part of initial load completed",
-                );
-            });
-
-            let WaitForPhaseResult {
-                timeout_remaining: timeout,
-                skipped: init_remote_skipped,
-            } = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
-
-            let init_load_done = std::pin::pin!(async {
-                init_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_tenant_load",
-                    "Initial load completed",
-                );
-                STARTUP_IS_LOADING.set(0);
-            });
-
-            let WaitForPhaseResult {
-                timeout_remaining: timeout,
-                skipped: init_load_skipped,
-            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
+            init_done_rx.wait().await;
+            startup_checkpoint("initial_tenant_load", "Initial load completed");
+            STARTUP_IS_LOADING.set(0);

            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before initial logical sizes completed")
-            });
+            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());

-            let logical_sizes_done = std::pin::pin!(async {
-                init_logical_size_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_logical_sizes",
-                    "Initial logical sizes completed",
-                );
-            });
+            let timeout = conf.background_task_maximum_delay;

-            let WaitForPhaseResult {
-                timeout_remaining: _,
-                skipped: logical_sizes_skipped,
-            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+
+            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
+                Ok(_) => {
+                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
+                    None
+                }
+                Err(_) => {
+                    tracing::info!(
+                        timeout_millis = timeout.as_millis(),
+                        "Initial logical size timeout elapsed; starting background jobs"
+                    );
+                    Some(init_sizes_done)
+                }
+            };

            scopeguard::ScopeGuard::into_inner(guard);

-            // allow background jobs to start: we either completed prior stages, or they reached timeout
-            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
-            // because things like consumption metrics for billing are blocked by this barrier.
+            // allow background jobs to start
            drop(background_jobs_can_start);
-            startup_checkpoint(
-                started_startup_at,
-                "background_jobs_can_start",
-                "Starting background jobs",
-            );
+            startup_checkpoint("background_jobs_can_start", "Starting background jobs");

-            // We are done. If we skipped any phases due to timeout, run them to completion here so that
-            // they will eventually update their startup_checkpoint, and so that we do not declare the
-            // 'complete' stage until all the other stages are really done.
-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before waiting for skipped phases done")
-            });
-            if let Some(f) = init_remote_skipped {
-                f.await;
-            }
-            if let Some(f) = init_load_skipped {
-                f.await;
-            }
-            if let Some(f) = logical_sizes_skipped {
-                f.await;
-            }
-            scopeguard::ScopeGuard::into_inner(guard);
+            if let Some(init_sizes_done) = init_sizes_done {
+                // ending up here is not a bug; at the latest logical sizes will be queried by
+                // consumption metrics.
+                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+                init_sizes_done.await;

-            startup_checkpoint(started_startup_at, "complete", "Startup complete");
+                scopeguard::ScopeGuard::into_inner(guard);
+
+                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
+
+            }
+
+            startup_checkpoint("complete", "Startup complete");
        };

        async move {
@@ -524,18 +447,6 @@ fn start_pageserver(
        }
    });

-    let secondary_controller = if let Some(remote_storage) = &remote_storage {
-        secondary::spawn_tasks(
-            conf,
-            tenant_manager.clone(),
-            remote_storage.clone(),
-            background_jobs_barrier.clone(),
-            shutdown_pageserver.clone(),
-        )
-    } else {
-        secondary::null_controller()
-    };
-
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
    // is still accessible even if background task is not configured as long as remote storage has
@@ -547,7 +458,6 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
-            tenant_manager.clone(),
            background_jobs_barrier.clone(),
        )?;
    }
@@ -560,13 +470,11 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
-                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
                disk_usage_eviction_state,
                deletion_queue.new_client(),
-                secondary_controller,
            )
            .context("Failed to initialize router state")?,
        );
@@ -666,13 +574,37 @@ fn start_pageserver(
                    pageserver_listener,
                    conf.pg_auth_type,
                    libpq_ctx,
-                    task_mgr::shutdown_token(),
                )
                .await
            },
        );
    }

+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::BackgroundRuntimeTurnaroundMeasure,
+        None,
+        None,
+        "background runtime turnaround measure",
+        true,
+        async move {
+            let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
+            let server = server
+                .serve(hyper::service::make_service_fn(|_| async move {
+                    Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
+                        move |_: hyper::Request<hyper::Body>| async move {
+                            Ok::<_, std::convert::Infallible>(hyper::Response::new(
+                                hyper::Body::from(format!("alive")),
+                            ))
+                        },
+                    ))
+                }))
+                .with_graceful_shutdown(task_mgr::shutdown_watcher());
+            server.await?;
+            Ok(())
+        },
+    );
+
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

    // All started up! Now just sit and wait for shutdown signal.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -211,10 +211,6 @@ pub struct PageServerConf {

    /// JWT token for use with the control plane API.
    pub control_plane_api_token: Option<SecretString>,
-
-    /// If true, pageserver will make best-effort to operate without a control plane: only
-    /// for use in major incidents.
-    pub control_plane_emergency_mode: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -292,7 +288,6 @@ struct PageServerConfigBuilder {

    control_plane_api: BuilderValue<Option<Url>>,
    control_plane_api_token: BuilderValue<Option<SecretString>>,
-    control_plane_emergency_mode: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -360,7 +355,6 @@ impl Default for PageServerConfigBuilder {

            control_plane_api: Set(None),
            control_plane_api_token: Set(None),
-            control_plane_emergency_mode: Set(false),
        }
    }
 }
@@ -497,10 +491,6 @@ impl PageServerConfigBuilder {
        self.control_plane_api_token = BuilderValue::Set(token)
    }

-    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
-        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -592,9 +582,6 @@ impl PageServerConfigBuilder {
            control_plane_api_token: self
                .control_plane_api_token
                .ok_or(anyhow!("missing control_plane_api_token"))?,
-            control_plane_emergency_mode: self
-                .control_plane_emergency_mode
-                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
        })
    }
 }
@@ -820,10 +807,6 @@ impl PageServerConf {
                        builder.control_plane_api_token(Some(parsed.into()))
                    }
                },
-                "control_plane_emergency_mode" => {
-                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-
-                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -993,7 +976,6 @@ impl PageServerConf {
            background_task_maximum_delay: Duration::ZERO,
            control_plane_api: None,
            control_plane_api_token: None,
-            control_plane_emergency_mode: false,
        }
    }
 }
@@ -1217,8 +1199,7 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_api_token: None
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1274,8 +1255,7 @@ background_task_maximum_delay = '334 s'
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_api_token: None
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -11,7 +11,6 @@ use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
-use tokio::time::Instant;
 use tracing::*;
 use utils::id::NodeId;

@@ -89,12 +88,22 @@ pub async fn collect_metrics(

    let node_id = node_id.to_string();

+    // reminder: ticker is ready immediately
+    let mut ticker = tokio::time::interval(metric_collection_interval);
+
    loop {
-        let started_at = Instant::now();
+        let tick_at = tokio::select! {
+            _ = cancel.cancelled() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

        // these are point in time, with variable "now"
        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

+        if metrics.is_empty() {
+            continue;
+        }
+
        let metrics = Arc::new(metrics);

        // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -133,19 +142,10 @@ pub async fn collect_metrics(
        let (_, _) = tokio::join!(flush, upload);

        crate::tenant::tasks::warn_when_period_overrun(
-            started_at.elapsed(),
+            tick_at.elapsed(),
            metric_collection_interval,
            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );
-
-        let res = tokio::time::timeout_at(
-            started_at + metric_collection_interval,
-            task_mgr::shutdown_token().cancelled(),
-        )
-        .await;
-        if res.is_ok() {
-            return Ok(());
-        }
    }
 }

@@ -244,14 +244,16 @@ async fn calculate_synthetic_size_worker(
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
-    scopeguard::defer! {
-        info!("calculate_synthetic_size_worker stopped");
-    };

+    // reminder: ticker is ready immediately
+    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        let started_at = Instant::now();
+        let tick_at = tokio::select! {
+            _ = task_mgr::shutdown_watcher() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
@@ -266,8 +268,8 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
-                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+                // TODO should we just use concurrent_background_tasks_rate_limit()?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
@@ -279,18 +281,9 @@ async fn calculate_synthetic_size_worker(
        }

        crate::tenant::tasks::warn_when_period_overrun(
-            started_at.elapsed(),
+            tick_at.elapsed(),
            synthetic_size_calculation_interval,
            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
        );
-
-        let res = tokio::time::timeout_at(
-            started_at + synthetic_size_calculation_interval,
-            task_mgr::shutdown_token().cancelled(),
-        )
-        .await;
-        if res.is_ok() {
-            return Ok(());
-        }
    }
 }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -206,6 +206,7 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
+                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -133,8 +133,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            node_id: self.node_id,
        };

-        fail::fail_point!("control-plane-client-re-attach");
-
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
@@ -170,8 +168,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };

-        fail::fail_point!("control-plane-client-validate");
-
        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

        Ok(response
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -40,6 +40,7 @@ use validator::ValidatorQueueMessage;

 use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};

+// TODO: administrative "panic button" config property to disable all deletions
 // TODO: configurable for how long to wait before executing deletions

 /// We aggregate object deletions from many tenants in one place, for several reasons:
@@ -153,7 +154,7 @@ impl FlushOp {

 #[derive(Clone, Debug)]
 pub struct DeletionQueueClient {
-    tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
+    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,

    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
@@ -185,7 +186,7 @@ where
    V: Serialize,
    I: AsRef<[u8]>,
 {
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
+    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));

    transformed
        .collect::<HashMap<String, &V>>()
@@ -212,7 +213,7 @@ where

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
-const TEMP_SUFFIX: &str = "tmp";
+const TEMP_SUFFIX: &str = ".tmp";

 #[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
@@ -324,7 +325,10 @@ impl DeletionList {
            return false;
        }

-        let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();
+        let timeline_entry = tenant_entry
+            .timelines
+            .entry(*timeline)
+            .or_insert_with(Vec::new);

        let timeline_remote_path = remote_timeline_path(tenant, timeline);

@@ -416,7 +420,7 @@ pub enum DeletionQueueError {
 impl DeletionQueueClient {
    pub(crate) fn broken() -> Self {
        // Channels whose receivers are immediately dropped.
-        let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
+        let (tx, _rx) = tokio::sync::mpsc::channel(1);
        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
        Self {
            tx,
@@ -428,12 +432,12 @@ impl DeletionQueueClient {
    /// This is cancel-safe.  If you drop the future before it completes, the message
    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
    /// we decide to do a deletion the decision is always final.
-    fn do_push<T>(
+    async fn do_push<T>(
        &self,
-        queue: &tokio::sync::mpsc::UnboundedSender<T>,
+        queue: &tokio::sync::mpsc::Sender<T>,
        msg: T,
    ) -> Result<(), DeletionQueueError> {
-        match queue.send(msg) {
+        match queue.send(msg).await {
            Ok(_) => Ok(()),
            Err(e) => {
                // This shouldn't happen, we should shut down all tenants before
@@ -445,7 +449,7 @@ impl DeletionQueueClient {
        }
    }

-    pub(crate) fn recover(
+    pub(crate) async fn recover(
        &self,
        attached_tenants: HashMap<TenantId, Generation>,
    ) -> Result<(), DeletionQueueError> {
@@ -453,6 +457,7 @@ impl DeletionQueueClient {
            &self.tx,
            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
        )
+        .await
    }

    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
@@ -525,21 +530,6 @@ impl DeletionQueueClient {
            return self.flush_immediate().await;
        }

-        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
-    }
-
-    /// When a Tenant has a generation, push_layers is always synchronous because
-    /// the ListValidator channel is an unbounded channel.
-    ///
-    /// This can be merged into push_layers when we remove the Generation-less mode
-    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
-    pub(crate) fn push_layers_sync(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        current_generation: Generation,
-        layers: Vec<(LayerFileName, Generation)>,
-    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
@@ -553,16 +543,17 @@ impl DeletionQueueClient {
                objects: Vec::new(),
            }),
        )
+        .await
    }

    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
    async fn do_flush<T>(
        &self,
-        queue: &tokio::sync::mpsc::UnboundedSender<T>,
+        queue: &tokio::sync::mpsc::Sender<T>,
        msg: T,
        rx: tokio::sync::oneshot::Receiver<()>,
    ) -> Result<(), DeletionQueueError> {
-        self.do_push(queue, msg)?;
+        self.do_push(queue, msg).await?;
        if rx.await.is_err() {
            // This shouldn't happen if tenants are shut down before deletion queue.  If we
            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
@@ -583,18 +574,6 @@ impl DeletionQueueClient {
            .await
    }

-    /// Issue a flush without waiting for it to complete.  This is useful on advisory flushes where
-    /// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
-    /// detach where flushing is nice but not necessary.
-    ///
-    /// This function provides no guarantees of work being done.
-    pub fn flush_advisory(&self) {
-        let (flush_op, _) = FlushOp::new();
-
-        // Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
-        drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
-    }
-
    // Wait until all previous deletions are executed
    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
        debug!("flush_execute: flushing to deletion lists...");
@@ -611,7 +590,9 @@ impl DeletionQueueClient {
        // Flush any immediate-mode deletions (the above backend flush will only flush
        // the executor if deletions had flowed through the backend)
        debug!("flush_execute: flushing execution...");
-        self.flush_immediate().await?;
+        let (flush_op, rx) = FlushOp::new();
+        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
+            .await?;
        debug!("flush_execute: finished flushing execution...");
        Ok(())
    }
@@ -666,10 +647,8 @@ impl DeletionQueue {
    where
        C: ControlPlaneGenerationsApi + Send + Sync,
    {
-        // Unbounded channel: enables non-async functions to submit deletions.  The actual length is
-        // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
-        // enough to avoid this taking pathologically large amount of memory.
-        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+        // Deep channel: it consumes deletions from all timelines and we do not want to block them
+        let (tx, rx) = tokio::sync::mpsc::channel(16384);

        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
@@ -982,7 +961,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new())?;
+        client.recover(HashMap::new()).await?;

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_id = ctx.harness.tenant_id;
@@ -1050,7 +1029,7 @@ mod test {
    async fn deletion_queue_validation() -> anyhow::Result<()> {
        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new())?;
+        client.recover(HashMap::new()).await?;

        // Generation that the control plane thinks is current
        let latest_generation = Generation::new(0xdeadbeef);
@@ -1107,7 +1086,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new())?;
+        client.recover(HashMap::new()).await?;

        let tenant_id = ctx.harness.tenant_id;

@@ -1170,7 +1149,9 @@ mod test {
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::from([(tenant_id, now_generation)]))?;
+        client
+            .recover(HashMap::from([(tenant_id, now_generation)]))
+            .await?;

        info!("Flush-executing");
        client.flush_execute().await?;
@@ -1196,7 +1177,7 @@ pub(crate) mod mock {
    };

    pub struct ConsumerState {
-        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
    }

@@ -1273,7 +1254,7 @@ pub(crate) mod mock {
    }

    pub struct MockDeletionQueue {
-        tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
        executed: Arc<AtomicUsize>,
        remote_storage: Option<GenericRemoteStorage>,
@@ -1283,7 +1264,7 @@ pub(crate) mod mock {

    impl MockDeletionQueue {
        pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
-            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+            let (tx, rx) = tokio::sync::mpsc::channel(16384);
            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);

            let executed = Arc::new(AtomicUsize::new(0));
@@ -1298,6 +1279,10 @@ pub(crate) mod mock {
            }
        }

+        pub fn get_executed(&self) -> usize {
+            self.executed.load(Ordering::Relaxed)
+        }
+
        #[allow(clippy::await_holding_lock)]
        pub async fn pump(&self) {
            if let Some(remote_storage) = &self.remote_storage {
--- a/pageserver/src/deletion_queue/check.log
+++ b/pageserver/src/deletion_queue/check.log
@@ -1,2 +0,0 @@
-    Checking pageserver v0.1.0 (/home/neon/neon/pageserver)
-    Finished dev [optimized + debuginfo] target(s) in 7.62s
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -13,7 +13,6 @@ use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
-use utils::backoff;

 use crate::metrics;

@@ -64,19 +63,7 @@ impl Deleter {
            Err(anyhow::anyhow!("failpoint hit"))
        });

-        // A backoff::retry is used here for two reasons:
-        // - To provide a backoff rather than busy-polling the API on errors
-        // - To absorb transient 429/503 conditions without hitting our error
-        //   logging path for issues deleting objects.
-        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
-            |_| false,
-            3,
-            10,
-            "executing deletion batch",
-            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
-        )
-        .await
+        self.remote_storage.delete_objects(&self.accumulator).await
    }

    /// Block until everything in accumulator has been executed
@@ -101,10 +88,7 @@ impl Deleter {
                    self.accumulator.clear();
                }
                Err(e) => {
-                    if self.cancel.is_cancelled() {
-                        return Err(DeletionQueueError::ShuttingDown);
-                    }
-                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
+                    warn!("DeleteObjects request failed: {e:#}, will retry");
                    metrics::DELETION_QUEUE
                        .remote_errors
                        .with_label_values(&["execute"])
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -85,7 +85,7 @@ pub(super) struct ListWriter {
    conf: &'static PageServerConf,

    // Incoming frontend requests to delete some keys
-    rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
+    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,

    // Outbound requests to the backend to execute deletion lists we have composed.
    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
@@ -111,7 +111,7 @@ impl ListWriter {

    pub(super) fn new(
        conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
        cancel: CancellationToken,
    ) -> Self {
@@ -230,7 +230,6 @@ impl ListWriter {
        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();

-        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
        while let Some(dentry) = dir.next_entry().await? {
@@ -242,7 +241,7 @@ impl ListWriter {
                continue;
            }

-            if dentry_str.ends_with(&temp_extension) {
+            if dentry_str.ends_with(TEMP_SUFFIX) {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -220,8 +220,6 @@ where
                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
-                } else {
-                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
                }
                this_list_valid
            });
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -48,26 +48,19 @@ use std::{
 };

 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
-use utils::{
-    completion,
-    id::{TenantId, TenantTimelineId},
-};
-use utils::{id::TimelineId, serde_percent::Percent};
+use utils::completion;
+use utils::serde_percent::Percent;

 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{
-        mgr::TenantManager,
-        secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer},
-        Timeline,
-    },
+    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -90,7 +83,6 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
-    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -116,7 +108,8 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
+            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+                .await;
            Ok(())
        },
    );
@@ -128,8 +121,8 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    _storage: &GenericRemoteStorage,
-    tenant_manager: Arc<TenantManager>,
+    storage: GenericRemoteStorage,
+    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
@@ -152,9 +145,14 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res =
-                disk_usage_eviction_task_iteration(state, task_config, &tenant_manager, &cancel)
-                    .await;
+            let res = disk_usage_eviction_task_iteration(
+                state,
+                task_config,
+                &storage,
+                tenants_dir,
+                &cancel,
+            )
+            .await;

            match res {
                Ok(()) => {}
@@ -185,14 +183,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    tenant_manager: &Arc<TenantManager>,
+    storage: &GenericRemoteStorage,
+    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
-    let tenants_dir = tenant_manager.get_conf().tenants_path();
-    let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
+    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res =
-        disk_usage_eviction_task_iteration_impl(state, usage_pre, tenant_manager, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -202,7 +199,7 @@ async fn disk_usage_eviction_task_iteration(
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
-                    let after = filesystem_level_usage::get(&tenants_dir, task_config)
+                    let after = filesystem_level_usage::get(tenants_dir, task_config)
                        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
                        .context("get filesystem-level disk usage after evictions")?;

@@ -276,8 +273,8 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
+    storage: &GenericRemoteStorage,
    usage_pre: U,
-    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
    // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -297,7 +294,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates = match collect_eviction_candidates(tenant_manager, cancel).await? {
+    let candidates = match collect_eviction_candidates(cancel).await? {
        EvictionCandidates::Cancelled => {
            return Ok(IterationOutcome::Cancelled);
        }
@@ -333,16 +330,9 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-
-    // Evictions for attached tenants, batched by timeline
-    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
-
-    // Evictions for secondary locations, batched by tenant
-    let mut secondary_by_tenant: HashMap<TenantId, Vec<(TimelineId, Layer)>> = HashMap::new();
-
+    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
-    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -359,26 +349,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
-        // tasks to evict all seen layers until we have evicted enough
-
-        match candidate.source {
-            EvictionCandidateSource::Attached(timeline) => {
-                let batch = batched.entry(TimelineKey(timeline)).or_default();
-
-                // semaphore will later be used to limit eviction concurrency, and we can express at
-                // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
-                // but fail gracefully by not making batches larger.
-                if batch.len() < u32::MAX as usize {
-                    batch.push(candidate.layer);
-                    max_batch_size = max_batch_size.max(batch.len());
-                }
-            }
-            EvictionCandidateSource::Secondary(ttid) => {
-                let batch = secondary_by_tenant.entry(ttid.tenant_id).or_default();
-                batch.push((ttid.timeline_id, candidate.layer));
-            }
-        }
+        batched
+            .entry(TimelineKey(candidate.timeline))
+            .or_default()
+            .push(candidate.layer);
    }

    let usage_planned = match warned {
@@ -393,116 +367,66 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    };
    debug!(?usage_planned, "usage planned");

-    // phase2 (secondary tenants): evict victims batched by tenant
-    for (tenant_id, timeline_layers) in secondary_by_tenant {
-        // Q: Why do we go via TenantManager again rather than just deleting files, or keeping
-        // an Arc ref to the secondary state?
-        // A: It's because a given tenant's local storage **belongs** to whoever is currently
-        // live in the TenantManager.  We must avoid a race where we might plan an eviction
-        // for secondary, and then execute it when the tenant is actually in an attached state.
-        tenant_manager
-            .evict_tenant_layers(&tenant_id, timeline_layers)
-            .instrument(tracing::info_span!("evict_batch", %tenant_id))
-            .await;
-    }
-
-    // phase2 (attached tenants): evict victims batched by timeline
-
-    let mut js = tokio::task::JoinSet::new();
-
-    // ratelimit to 1k files or any higher max batch size
-    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+    // phase2: evict victims batched by timeline

+    // After the loop, `usage_assumed` is the post-eviction usage,
+    // according to internal accounting.
+    let mut usage_assumed = usage_pre;
+    let mut evictions_failed = LayerCount::default();
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size =
-            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
-
-        // I dislike naming of `available_permits` but it means current total amount of permits
-        // because permits can be added
-        assert!(batch_size as usize <= limit.available_permits());
+        let batch_size = batch.len();

        debug!(%timeline_id, "evicting batch for timeline");

-        let evict = {
-            let limit = limit.clone();
-            let cancel = cancel.clone();
-            async move {
-                let mut evicted_bytes = 0;
-                let mut evictions_failed = LayerCount::default();
+        async {
+            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;

-                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
-                    // semaphore closing means cancelled
-                    return (evicted_bytes, evictions_failed);
-                };
-
-                let results = timeline.evict_layers(&batch, &cancel).await;
-
-                match results {
-                    Ok(results) => {
-                        assert_eq!(results.len(), batch.len());
-                        for (result, layer) in results.into_iter().zip(batch.iter()) {
-                            let file_size = layer.layer_desc().file_size;
-                            match result {
-                                Some(Ok(())) => {
-                                    evicted_bytes += file_size;
-                                }
-                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                                    evictions_failed.file_sizes += file_size;
-                                    evictions_failed.count += 1;
-                                }
-                                None => {
-                                    assert!(cancel.is_cancelled());
-                                }
+            match results {
+                Err(e) => {
+                    warn!("failed to evict batch: {:#}", e);
+                }
+                Ok(results) => {
+                    assert_eq!(results.len(), batch.len());
+                    for (result, layer) in results.into_iter().zip(batch.iter()) {
+                        let file_size = layer.layer_desc().file_size;
+                        match result {
+                            Some(Ok(())) => {
+                                usage_assumed.add_available_bytes(file_size);
+                            }
+                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
+                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
+                            }
+                            Some(Err(EvictionError::FileNotFound)) => {
+                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.count += 1;
+                            }
+                            Some(Err(
+                                e @ EvictionError::LayerNotFound(_)
+                                | e @ EvictionError::StatFailed(_),
+                            )) => {
+                                let e = utils::error::report_compact_sources(&e);
+                                warn!(%layer, "failed to evict layer: {e}");
+                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.count += 1;
+                            }
+                            None => {
+                                assert!(cancel.is_cancelled());
+                                return;
                            }
                        }
                    }
-                    Err(e) => {
-                        warn!("failed to evict batch: {:#}", e);
-                    }
                }
-                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
+        .await;

-        js.spawn(evict);
-
-        // spwaning multiple thousands of these is essentially blocking, so give already spawned a
-        // chance of making progress
-        tokio::task::yield_now().await;
-    }
-
-    let join_all = async move {
-        // After the evictions, `usage_assumed` is the post-eviction usage,
-        // according to internal accounting.
-        let mut usage_assumed = usage_pre;
-        let mut evictions_failed = LayerCount::default();
-
-        while let Some(res) = js.join_next().await {
-            match res {
-                Ok((evicted_bytes, failed)) => {
-                    usage_assumed.add_available_bytes(evicted_bytes);
-                    evictions_failed.file_sizes += failed.file_sizes;
-                    evictions_failed.count += failed.count;
-                }
-                Err(je) if je.is_cancelled() => unreachable!("not used"),
-                Err(je) if je.is_panic() => { /* already logged */ }
-                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
-            }
-        }
-        (usage_assumed, evictions_failed)
-    };
-
-    let (usage_assumed, evictions_failed) = tokio::select! {
-        tuple = join_all => { tuple },
-        _ = cancel.cancelled() => {
-            // close the semaphore to stop any pending acquires
-            limit.close();
+        if cancel.is_cancelled() {
            return Ok(IterationOutcome::Cancelled);
        }
-    };
+    }

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -514,19 +438,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    }))
 }

-// An eviction candidate might originate from either an attached tenant
-// with a [`Tenant`] and [`Timeline`] object, or from a secondary tenant
-// location.  These differ in how we will execute the eviction.
-#[derive(Clone)]
-enum EvictionCandidateSource {
-    Attached(Arc<Timeline>),
-    Secondary(TenantTimelineId),
-}
-
 #[derive(Clone)]
 struct EvictionCandidate {
-    source: EvictionCandidateSource,
-    layer: Layer,
+    timeline: Arc<Timeline>,
+    layer: Arc<dyn PersistentLayer>,
    last_activity_ts: SystemTime,
 }

@@ -575,18 +490,27 @@ enum EvictionCandidates {
 /// after exhauting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
 async fn collect_eviction_candidates(
-    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
    // get a snapshot of the list of tenants
+    let tenants = tenant::mgr::list_tenants()
+        .await
+        .context("get list of tenants")?;
+
    let mut candidates = Vec::new();

-    let tenants = tenant_manager.get_attached_tenants();
-
-    for tenant in tenants {
+    for (tenant_id, _state) in &tenants {
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
+            Ok(tenant) => tenant,
+            Err(e) => {
+                // this can happen if tenant has lifecycle transition after we fetched it
+                debug!("failed to get tenant: {e:#}");
+                continue;
+            }
+        };

        // collect layers from all timelines in this tenant
        //
@@ -649,7 +573,7 @@ async fn collect_eviction_candidates(
        for (timeline, layer_info) in tenant_candidates.into_iter() {
            let file_size = layer_info.file_size();
            let candidate = EvictionCandidate {
-                source: EvictionCandidateSource::Attached(timeline),
+                timeline,
                last_activity_ts: layer_info.last_activity_ts,
                layer: layer_info.layer,
            };
@@ -663,43 +587,6 @@ async fn collect_eviction_candidates(
        }
    }

-    // FIXME: this is a long loop over all secondary locations.  At the least, respect
-    // cancellation here, but really we need to break up the loop.  We could extract the
-    // Arc<SecondaryTenant>s and iterate over them with some tokio yields in there.  Ideally
-    // though we should just reduce the total amount of work: our eviction goals do not require
-    // listing absolutely every layer in every tenant: we could sample this.
-    tenant_manager.foreach_secondary_tenants(
-        |tenant_id: &TenantId, state: &Arc<SecondaryTenant>| {
-        let mut tenant_candidates = Vec::new();
-        for (timeline_id, layer_info) in state.get_layers_for_eviction() {
-            debug!(tenant_id=%tenant_id, timeline_id=%timeline_id, "timeline resident layers (secondary) count: {}", layer_info.resident_layers.len());
-            tenant_candidates.extend(
-                layer_info.resident_layers
-                    .into_iter()
-                    .map(|layer_infos| (timeline_id, layer_infos)),
-            );
-        }
-
-        tenant_candidates
-            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
-
-        candidates.extend(tenant_candidates.into_iter().map(|(timeline_id, candidate)| {
-            (
-                // Secondary locations' layers are always considered above the min resident size,
-                // i.e. secondary locations are permitted to be trimmed to zero layers if all
-                // the layers have sufficiently old access times.
-                MinResidentSizePartition::Above,
-                    EvictionCandidate {
-                    source: EvictionCandidateSource::Secondary(TenantTimelineId { tenant_id: *tenant_id, timeline_id}),
-                    last_activity_ts: candidate.last_activity_ts,
-                    layer: candidate.layer,
-                }
-            )
-        }));
-
-        },
-    );
-
    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
    candidates
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,16 +93,9 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
    delete:
      description: |
-        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
+        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
        "400":
@@ -141,13 +134,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -192,13 +178,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
@@ -247,13 +226,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
    delete:
      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
@@ -293,74 +265,7 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

-  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    get:
-      description: Get timestamp for a given LSN
-      parameters:
-        - name: lsn
-          in: query
-          required: true
-          schema:
-            type: integer
-          description: A LSN to get the timestamp
-      responses:
-        "200":
-          description: OK
-          content:
-            application/json:
-              schema:
-                type: string
-                format: date-time
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found, or there is no timestamp information for the given lsn
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
        "500":
          description: Generic operation error
          content:
@@ -392,19 +297,13 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
-        - name: version
-          in: query
-          required: false
-          schema:
-            type: integer
-          description: The version of the endpoint to use
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
-                $ref: "#/components/schemas/LsnByTimestampResponse"
+                type: string
        "400":
          description: Error when no tenant id found in path, no timeline id or invalid timestamp
          content:
@@ -429,13 +328,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -483,13 +375,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/attach:
    parameters:
      - name: tenant_id
@@ -580,13 +465,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/detach:
    parameters:
@@ -640,13 +518,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/ignore:
    parameters:
@@ -689,13 +560,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/load:
    parameters:
@@ -740,13 +604,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
@@ -784,12 +641,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/size:
    parameters:
@@ -853,13 +704,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -936,13 +780,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -973,13 +810,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
    post:
      description: |
        Create a tenant. Returns new tenant id on success.
@@ -1030,13 +860,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/config:
    put:
@@ -1082,13 +905,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/config/:
    parameters:
      - name: tenant_id
@@ -1138,13 +954,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
 components:
  securitySchemes:
    JWT:
@@ -1390,19 +1199,6 @@ components:
          type: string
          format: hex

-    LsnByTimestampResponse:
-      type: object
-      required:
-        - lsn
-        - kind
-      properties:
-        lsn:
-          type: string
-          format: hex
-        kind:
-          type: string
-          enum: [past, present, future, nodata]
-
    Error:
      type: object
      required:
@@ -1424,13 +1220,6 @@ components:
      properties:
        msg:
          type: string
-    ServiceUnavailableError:
-      type: object
-      required:
-        - msg
-      properties:
-        msg:
-          type: string
    NotFoundError:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2,14 +2,10 @@
 //! Management HTTP API
 //!
 use std::collections::HashMap;
-use std::str::FromStr;
 use std::sync::Arc;
-use std::time::Duration;

 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
-use humantime::format_rfc3339;
-use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -18,7 +14,6 @@ use pageserver_api::models::{
    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
-use serde_with::{serde_as, DisplayFromStr};
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -37,10 +32,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
-    TenantSlotError, TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
-use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::Timeline;
@@ -66,42 +59,35 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    deletion_queue_client: DeletionQueueClient,
-    secondary_controller: SecondaryController,
 }

 impl State {
-    #[allow(clippy::too_many_arguments)]
    pub fn new(
        conf: &'static PageServerConf,
-        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
-        secondary_controller: SecondaryController,
    ) -> anyhow::Result<Self> {
-        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
+        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
-            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
            broker_client,
            disk_usage_eviction_state,
            deletion_queue_client,
-            secondary_controller,
        })
    }

@@ -147,9 +133,11 @@ impl From<PageReconstructError> for ApiError {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
            PageReconstructError::AncestorStopping(_) => {
-                ApiError::ResourceUnavailable(format!("{pre}").into())
+                ApiError::ResourceUnavailable(format!("{pre}"))
+            }
+            PageReconstructError::WalRedo(pre) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
            }
-            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
 }
@@ -157,60 +145,31 @@ impl From<PageReconstructError> for ApiError {
 impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
-            TenantMapInsertError::SlotError(e) => e.into(),
-            TenantMapInsertError::SlotUpsertError(e) => e.into(),
+            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
+                ApiError::ResourceUnavailable(format!("{tmie}"))
+            }
+            TenantMapInsertError::TenantAlreadyExists(id, state) => {
+                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+            }
+            TenantMapInsertError::TenantExistsSecondary(id) => {
+                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
+            }
            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }

-impl From<TenantSlotError> for ApiError {
-    fn from(e: TenantSlotError) -> ApiError {
-        use TenantSlotError::*;
-        match e {
-            NotFound(tenant_id) => {
-                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
-            }
-            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
-            e @ Conflict(_) => ApiError::Conflict(format!("{e}")),
-            InProgress => {
-                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
-            }
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantSlotUpsertError> for ApiError {
-    fn from(e: TenantSlotUpsertError) -> ApiError {
-        use TenantSlotUpsertError::*;
-        match e {
-            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantMapError> for ApiError {
-    fn from(e: TenantMapError) -> ApiError {
-        use TenantMapError::*;
-        match e {
-            StillInitializing | ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{e}").into())
-            }
-        }
-    }
-}
-
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            TenantStateError::NotActive(_) => {
+                ApiError::ResourceUnavailable("Tenant not yet active".into())
+            }
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
-            TenantStateError::SlotError(e) => e.into(),
-            TenantStateError::SlotUpsertError(e) => e.into(),
-            TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
+            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
 }
@@ -285,9 +244,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            Get(g) => ApiError::from(g),
            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
-            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
-            SlotError(e) => e.into(),
-            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
        }
@@ -414,7 +370,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -439,9 +395,6 @@ async fn timeline_create_handler(
                    format!("{err:#}")
                ))
            }
-            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
-            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -461,7 +414,7 @@ async fn timeline_list_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -500,7 +453,7 @@ async fn timeline_detail_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -530,8 +483,6 @@ async fn get_lsn_by_timestamp_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let version: Option<u8> = parse_query_param(&request, "version")?;
-
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -543,59 +494,13 @@ async fn get_lsn_by_timestamp_handler(
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

-    if version.unwrap_or(0) > 1 {
-        #[serde_as]
-        #[derive(serde::Serialize)]
-        struct Result {
-            #[serde_as(as = "DisplayFromStr")]
-            lsn: Lsn,
-            kind: &'static str,
-        }
-        let (lsn, kind) = match result {
-            LsnForTimestamp::Present(lsn) => (lsn, "present"),
-            LsnForTimestamp::Future(lsn) => (lsn, "future"),
-            LsnForTimestamp::Past(lsn) => (lsn, "past"),
-            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
-        };
-        json_response(StatusCode::OK, Result { lsn, kind })
-    } else {
-        // FIXME: this is a temporary crutch not to break backwards compatibility
-        // See https://github.com/neondatabase/neon/pull/5608
-        let result = match result {
-            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-            LsnForTimestamp::Future(_lsn) => "future".into(),
-            LsnForTimestamp::Past(_lsn) => "past".into(),
-            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-        };
-        json_response(StatusCode::OK, result)
-    }
-}
-
-async fn get_timestamp_of_lsn_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-
-    let lsn_str = must_get_query_param(&request, "lsn")?;
-    let lsn = Lsn::from_str(&lsn_str)
-        .with_context(|| format!("Invalid LSN: {lsn_str:?}"))
-        .map_err(ApiError::BadRequest)?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
-
-    match result {
-        Some(time) => {
-            let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
-            json_response(StatusCode::OK, time)
-        }
-        None => json_response(StatusCode::NOT_FOUND, ()),
-    }
+    let result = match result {
+        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+        LsnForTimestamp::Future(_lsn) => "future".into(),
+        LsnForTimestamp::Past(_lsn) => "past".into(),
+        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+    };
+    json_response(StatusCode::OK, result)
 }

 async fn tenant_attach_handler(
@@ -666,14 +571,9 @@ async fn tenant_detach_handler(

    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(
-        conf,
-        tenant_id,
-        detach_ignored.unwrap_or(false),
-        &state.deletion_queue_client,
-    )
-    .instrument(info_span!("tenant_detach", %tenant_id))
-    .await?;
+    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
+        .instrument(info_span!("tenant_detach", %tenant_id))
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -736,7 +636,7 @@ async fn tenant_list_handler(
        .instrument(info_span!("tenant_list"))
        .await
        .map_err(|_| {
-            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
+            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string())
        })?
        .iter()
        .map(|(id, state)| TenantInfo {
@@ -758,7 +658,7 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false)?;
+        let tenant = mgr::get_tenant(tenant_id, false).await?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -821,7 +721,7 @@ async fn tenant_size_handler(
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;

    // this can be long operation
    let inputs = tenant
@@ -834,10 +734,6 @@ async fn tenant_size_handler(
        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
-    let accepts_html = headers
-        .get(header::ACCEPT)
-        .map(|v| v == "text/html")
-        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
        let storage_model = inputs
            .calculate_model()
@@ -845,11 +741,11 @@ async fn tenant_size_handler(
        let size = storage_model.calculate();

        // If request header expects html, return html
-        if accepts_html {
+        if headers["Accept"] == "text/html" {
            return synthetic_size_html_response(inputs, storage_model, size);
        }
        sizes = Some(size);
-    } else if accepts_html {
+    } else if headers["Accept"] == "text/html" {
        return Err(ApiError::BadRequest(anyhow!(
            "inputs_only parameter is incompatible with html output request"
        )));
@@ -1000,7 +896,7 @@ fn synthetic_size_html_response(
 pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
    let response = Response::builder()
        .status(status)
-        .header(header::CONTENT_TYPE, "text/html")
+        .header(hyper::header::CONTENT_TYPE, "text/html")
        .body(Body::from(data.as_bytes().to_vec()))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
@@ -1080,7 +976,7 @@ async fn get_tenant_config_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false)?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;

    let response = HashMap::from([
        (
@@ -1124,9 +1020,6 @@ async fn put_tenant_location_config_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
-
-    let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
-
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;

@@ -1137,31 +1030,29 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
+        mgr::detach_tenant(conf, tenant_id, true)
            .instrument(info_span!("tenant_detach", %tenant_id))
-            .await
-        {
-            match e {
-                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
-                    // This API is idempotent: a NotFound on a detach is fine.
-                }
-                _ => return Err(e.into()),
-            }
-        }
+            .await?;
        return json_response(StatusCode::OK, ());
    }

    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    state
-        .tenant_manager
-        .upsert_location(tenant_id, location_conf, flush, &ctx)
-        .await
-        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-        // principle we might have hit something like concurrent API calls to the same tenant,
-        // which is not a 400 but a 409.
-        .map_err(ApiError::BadRequest)?;
+    mgr::upsert_location(
+        state.conf,
+        tenant_id,
+        location_conf,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .await
+    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+    // principle we might have hit something like concurrent API calls to the same tenant,
+    // which is not a 400 but a 409.
+    .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1174,6 +1065,7 @@ async fn handle_tenant_break(
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
+        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1246,7 +1138,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .map_err(ApiError::InternalServerError)?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1271,7 +1163,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
@@ -1344,141 +1236,11 @@ async fn deletion_queue_flush(
    }
 }

-/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
-async fn getpage_at_lsn_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    struct Key(crate::repository::Key);
-
-    impl std::str::FromStr for Key {
-        type Err = anyhow::Error;
-
-        fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
-            crate::repository::Key::from_hex(s).map(Key)
-        }
-    }
-
-    let key: Key = parse_query_param(&request, "key")?
-        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
-    let lsn: Lsn = parse_query_param(&request, "lsn")?
-        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
-
-    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-
-        let page = timeline.get(key.0, lsn, &ctx).await?;
-
-        Result::<_, ApiError>::Ok(
-            Response::builder()
-                .status(StatusCode::OK)
-                .header(header::CONTENT_TYPE, "application/octet-stream")
-                .body(hyper::Body::from(page))
-                .unwrap(),
-        )
-    }
-    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
-    .await
-}
-
-async fn timeline_collect_keyspace(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    struct Partitioning {
-        keys: crate::keyspace::KeySpace,
-
-        at_lsn: Lsn,
-    }
-
-    impl serde::Serialize for Partitioning {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeMap;
-            let mut map = serializer.serialize_map(Some(2))?;
-            map.serialize_key("keys")?;
-            map.serialize_value(&KeySpace(&self.keys))?;
-            map.serialize_key("at_lsn")?;
-            map.serialize_value(&WithDisplay(&self.at_lsn))?;
-            map.end()
-        }
-    }
-
-    struct WithDisplay<'a, T>(&'a T);
-
-    impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            serializer.collect_str(&self.0)
-        }
-    }
-
-    struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
-
-    impl<'a> serde::Serialize for KeySpace<'a> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeSeq;
-            let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
-            for kr in &self.0.ranges {
-                seq.serialize_element(&KeyRange(kr))?;
-            }
-            seq.end()
-        }
-    }
-
-    struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
-
-    impl<'a> serde::Serialize for KeyRange<'a> {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeTuple;
-            let mut t = serializer.serialize_tuple(2)?;
-            t.serialize_element(&WithDisplay(&self.0.start))?;
-            t.serialize_element(&WithDisplay(&self.0.end))?;
-            t.end()
-        }
-    }
-
-    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
-
-    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let keys = timeline
-            .collect_keyspace(at_lsn, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
-    }
-    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
-    .await
-}
-
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
@@ -1541,18 +1303,17 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    if state.remote_storage.as_ref().is_none() {
+    let Some(storage) = state.remote_storage.clone() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    }
+    };

-    let eviction_state = state.disk_usage_eviction_state.clone();
+    let state = state.disk_usage_eviction_state.clone();

    let cancel = CancellationToken::new();
    let child_cancel = cancel.clone();
    let _g = cancel.drop_guard();
-    let tenant_manager = state.tenant_manager.clone();

    crate::task_mgr::spawn(
        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
@@ -1563,9 +1324,9 @@ async fn disk_usage_eviction_run(
        false,
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-                &eviction_state,
+                &state,
+                &storage,
                usage,
-                &tenant_manager,
                &child_cancel,
            )
            .await;
@@ -1583,36 +1344,6 @@ async fn disk_usage_eviction_run(
    json_response(StatusCode::OK, response)
 }

-async fn secondary_download_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    state
-        .secondary_controller
-        .download_tenant(tenant_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
-async fn secondary_upload_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    state
-        .secondary_controller
-        .upload_tenant(tenant_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1807,10 +1538,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
-            |r| api_handler(r, get_timestamp_of_lsn_handler),
-        )
        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
            api_handler(r, timeline_gc_handler)
        })
@@ -1849,16 +1576,6 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
-        .post("/v1/secondary/:tenant_id/upload", |r| {
-            testing_api_handler("force heatmap upload", r, secondary_upload_handler)
-        })
-        .post("/v1/secondary/:tenant_id/download", |r| {
-            testing_api_handler(
-                "force secondary layer download",
-                r,
-                secondary_download_handler,
-            )
-        })
        .put("/v1/tenant/:tenant_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
@@ -1866,12 +1583,5 @@ pub fn make_router(
        .post("/v1/tracing/event", |r| {
            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
-            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
-        })
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
-            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -149,10 +149,6 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    }
 }

-// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
-// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
-// from the name.
-
 pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
@@ -177,9 +173,6 @@ fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
 /// delaying is needed.
 #[derive(Clone)]
 pub struct InitializationOrder {
-    /// Each initial tenant load task carries this until it is done loading timelines from remote storage
-    pub initial_tenant_load_remote: Option<utils::completion::Completion>,
-
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -314,7 +314,6 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum PageCacheErrorKind {
    AcquirePinnedSlotTimeout,
-    EvictIterLimit,
 }

 pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
@@ -691,9 +690,10 @@ impl StorageIoTime {
        .expect("failed to define a metric");
        let metrics = std::array::from_fn(|i| {
            let op = StorageIoOperation::from_repr(i).unwrap();
-            storage_io_histogram_vec
+            let metric = storage_io_histogram_vec
                .get_metric_with_label_values(&[op.as_str()])
-                .unwrap()
+                .unwrap();
+            metric
        });
        Self { metrics }
    }
@@ -966,7 +966,6 @@ pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
    pub(crate) keys_executed: IntCounter,
-    pub(crate) keys_validated: IntCounter,
    pub(crate) dropped_lsn_updates: IntCounter,
    pub(crate) unexpected_errors: IntCounter,
    pub(crate) remote_errors: IntCounterVec,
@@ -988,13 +987,7 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {

    keys_executed: register_int_counter!(
        "pageserver_deletion_queue_executed_total",
-        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
-    )
-    .expect("failed to define a metric"),
-
-    keys_validated: register_int_counter!(
-        "pageserver_deletion_queue_validated_total",
-        "Number of keys validated for deletion.  Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
+        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
    )
    .expect("failed to define a metric"),

@@ -1388,23 +1381,28 @@ impl TimelineMetrics {
        }
    }

-    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
+    pub fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }

-    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
+    pub fn resident_physical_size_sub(&self, sz: u64) {
        self.resident_physical_size_gauge.sub(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    }

-    pub(crate) fn resident_physical_size_add(&self, sz: u64) {
+    pub fn resident_physical_size_add(&self, sz: u64) {
        self.resident_physical_size_gauge.add(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

-    pub(crate) fn resident_physical_size_get(&self) -> u64 {
+    pub fn resident_physical_size_set(&self, sz: u64) {
+        self.resident_physical_size_gauge.set(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
+    }
+
+    pub fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -83,6 +83,7 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
+use tracing::instrument;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -252,6 +253,9 @@ pub struct PageCache {
    next_evict_slot: AtomicUsize,

    size_metrics: &'static PageCacheSizeMetrics,
+
+    find_victim_waiters:
+        nostarve_queue::Queue<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
 }

 struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
@@ -318,6 +322,15 @@ impl std::ops::Deref for PageWriteGuard<'_> {
    }
 }

+impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
+    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => todo!(),
+        }
+    }
+}
+
 impl<'a> PageWriteGuard<'a> {
    /// Mark that the buffer contents are now valid.
    #[must_use]
@@ -430,8 +443,9 @@ impl PageCache {
    ///
    /// Store an image of the given page in the cache.
    ///
+    // #[cfg_attr(test, instrument(skip_all, level = "trace", fields(%key, %lsn)))]
    pub async fn memorize_materialized_page(
-        &self,
+        &'static self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: Key,
@@ -522,8 +536,9 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with immutable file pages.

+    // #[cfg_attr(test, instrument(skip_all, level = "trace", fields(?file_id, ?blkno)))]
    pub async fn read_immutable_buf(
-        &self,
+        &'static self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
@@ -629,7 +644,7 @@ impl PageCache {
    /// ```
    ///
    async fn lock_for_read(
-        &self,
+        &'static self,
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
@@ -851,10 +866,15 @@ impl PageCache {
    ///
    /// On return, the slot is empty and write-locked.
    async fn find_victim(
-        &self,
+        &'static self,
        _permit_witness: &PinnedSlotsPermit,
    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
-        let iter_limit = self.slots.len() * 10;
+        let nostarve_position = self.find_victim_waiters.begin()
+            .expect("we initialize the nostarve queue to the same size as the slots semaphore, and the caller is presenting a permit");
+
+        // let span = tracing::trace_span!("find_victim", ?nostarve_position);
+        // let _enter = span.enter();
+
        let mut iters = 0;
        loop {
            iters += 1;
@@ -866,41 +886,8 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        if iters > iter_limit {
-                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
-                            // any particular number of iterations: other threads might race ahead and acquire and
-                            // release pins just as we're scanning the array.
-                            //
-                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
-                            // slots. There are two threads running concurrently, A and B. A has just
-                            // acquired the permit from the semaphore.
-                            //
-                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
-                            //   B: Acquire permit.
-                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
-                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
-                            //   B: Release pin and permit again
-                            //   B: Acquire permit.
-                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
-                            //   B: Release pin and permit again
-                            //
-                            // Now we're back in the starting situation that both slots have
-                            // usage_count 1, but A has now been through one iteration of the
-                            // find_victim() loop. This can repeat indefinitely and on each
-                            // iteration, A's iteration count increases by one.
-                            //
-                            // So, even though the semaphore for the permits is fair, the victim search
-                            // itself happens in parallel and is not fair.
-                            // Hence even with a permit, a task can theoretically be starved.
-                            // To avoid this, we'd need tokio to give priority to tasks that are holding
-                            // permits for longer.
-                            // Note that just yielding to tokio during iteration without such
-                            // priority boosting is likely counter-productive. We'd just give more opportunities
-                            // for B to bump usage count, further starving A.
-                            crate::metrics::page_cache_errors_inc(
-                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
-                            );
-                            anyhow::bail!("exceeded evict iter limit");
+                        if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
+                            unreachable!("find_victim_waiters prevents starvation");
                        }
                        continue;
                    }
@@ -911,7 +898,8 @@ impl PageCache {
                    inner.key = None;
                }
                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
-                return Ok((slot_idx, inner));
+
+                return Ok(nostarve_position.complete_and_wait((slot_idx, inner)).await);
            }
        }
    }
@@ -955,6 +943,7 @@ impl PageCache {
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
+            find_victim_waiters: ::nostarve_queue::Queue::new(num_pages),
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -35,7 +35,6 @@ use std::time::Duration;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
-use tokio_util::sync::CancellationToken;
 use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -65,6 +64,69 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

+fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
+where
+    IO: AsyncRead + AsyncWrite + Unpin,
+{
+    async_stream::try_stream! {
+        loop {
+            let msg = tokio::select! {
+                biased;
+
+                _ = task_mgr::shutdown_watcher() => {
+                    // We were requested to shut down.
+                    let msg = "pageserver is shutting down";
+                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                    Err(QueryError::Other(anyhow::anyhow!(msg)))
+                }
+
+                msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+            };
+
+            match msg {
+                Ok(Some(message)) => {
+                    let copy_data_bytes = match message {
+                        FeMessage::CopyData(bytes) => bytes,
+                        FeMessage::CopyDone => { break },
+                        FeMessage::Sync => continue,
+                        FeMessage::Terminate => {
+                            let msg = "client terminated connection with Terminate message during COPY";
+                            let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                            break;
+                        }
+                        m => {
+                            let msg = format!("unexpected message {m:?}");
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                            Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                            break;
+                        }
+                    };
+
+                    yield copy_data_bytes;
+                }
+                Ok(None) => {
+                    let msg = "client closed connection during COPY";
+                    let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                    // error can't happen here, ErrorResponse serialization should be always ok
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                    pgb.flush().await?;
+                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                }
+                Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                    Err(io_error)?;
+                }
+                Err(other) => {
+                    Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                }
+            };
+        }
+    }
+}
+
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -122,7 +184,6 @@ pub async fn libpq_listener_main(
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
-    cancel: CancellationToken,
 ) -> anyhow::Result<()> {
    listener.set_nonblocking(true)?;
    let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
@@ -131,7 +192,7 @@ pub async fn libpq_listener_main(
    while let Some(res) = tokio::select! {
        biased;

-        _ = cancel.cancelled() => {
+        _ = task_mgr::shutdown_watcher() => {
            // We were requested to shut down.
            None
        }
@@ -223,13 +284,7 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(
-        conf,
-        broker_client,
-        auth,
-        connection_ctx,
-        task_mgr::shutdown_token(),
-    );
+    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -263,10 +318,6 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
-
-    /// A token that should fire when the tenant transitions from
-    /// attached state, or when the pageserver is shutting down.
-    cancel: CancellationToken,
 }

 impl PageServerHandler {
@@ -275,7 +326,6 @@ impl PageServerHandler {
        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
-        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -283,91 +333,6 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
-            cancel,
-        }
-    }
-
-    /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
-    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
-    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
-    /// in the flush.
-    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        tokio::select!(
-            flush_r = pgb.flush() => {
-                Ok(flush_r?)
-            },
-            _ = self.cancel.cancelled() => {
-                Err(QueryError::Shutdown)
-            }
-        )
-    }
-
-    fn copyin_stream<'a, IO>(
-        &'a self,
-        pgb: &'a mut PostgresBackend<IO>,
-    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        async_stream::try_stream! {
-            loop {
-                let msg = tokio::select! {
-                    biased;
-
-                    _ = self.cancel.cancelled() => {
-                        // We were requested to shut down.
-                        let msg = "pageserver is shutting down";
-                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                        Err(QueryError::Shutdown)
-                    }
-
-                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
-                };
-
-                match msg {
-                    Ok(Some(message)) => {
-                        let copy_data_bytes = match message {
-                            FeMessage::CopyData(bytes) => bytes,
-                            FeMessage::CopyDone => { break },
-                            FeMessage::Sync => continue,
-                            FeMessage::Terminate => {
-                                let msg = "client terminated connection with Terminate message during COPY";
-                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                                break;
-                            }
-                            m => {
-                                let msg = format!("unexpected message {m:?}");
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
-                                break;
-                            }
-                        };
-
-                        yield copy_data_bytes;
-                    }
-                    Ok(None) => {
-                        let msg = "client closed connection during COPY";
-                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                        // error can't happen here, ErrorResponse serialization should be always ok
-                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
-                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                    }
-                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
-                        Err(io_error)?;
-                    }
-                    Err(other) => {
-                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
-                    }
-                };
-            }
        }
    }

@@ -407,7 +372,7 @@ impl PageServerHandler {

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -415,10 +380,10 @@ impl PageServerHandler {
            let msg = tokio::select! {
                biased;

-                _ = self.cancel.cancelled() => {
+                _ = task_mgr::shutdown_watcher() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
-                    return Err(QueryError::Shutdown)
+                    break;
                }

                msg = pgb.read_message() => { msg }
@@ -500,7 +465,7 @@ impl PageServerHandler {
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb).await?;
+            pgb.flush().await?;
        }
        Ok(())
    }
@@ -543,9 +508,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
+        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -598,8 +563,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb).await?;
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
+        pgb.flush().await?;
+        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -807,7 +772,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -859,7 +824,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

        let basebackup_after = started
            .elapsed()
@@ -1314,7 +1279,7 @@ async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
    _ctx: &RequestContext, /* require get a context to support cancellation in the future */
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    let tenant = match mgr::get_tenant(tenant_id, false) {
+    let tenant = match mgr::get_tenant(tenant_id, false).await {
        Ok(tenant) => tenant,
        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
        Err(GetTenantError::NotActive(_)) => {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -19,7 +19,6 @@ use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
-use std::ops::ControlFlow;
 use std::ops::Range;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
@@ -371,6 +370,7 @@ impl Timeline {
        }
    }

+    ///
    /// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
    /// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
    ///
@@ -385,50 +385,6 @@ impl Timeline {
        found_larger: &mut bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
-        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
-            if timestamp >= search_timestamp {
-                *found_larger = true;
-                return ControlFlow::Break(true);
-            } else {
-                *found_smaller = true;
-            }
-            ControlFlow::Continue(())
-        })
-        .await
-    }
-
-    /// Obtain the possible timestamp range for the given lsn.
-    ///
-    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub async fn get_timestamp_for_lsn(
-        &self,
-        probe_lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Option<TimestampTz>, PageReconstructError> {
-        let mut max: Option<TimestampTz> = None;
-        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
-            if let Some(max_prev) = max {
-                max = Some(max_prev.max(timestamp));
-            } else {
-                max = Some(timestamp);
-            }
-            ControlFlow::Continue(())
-        })
-        .await?;
-
-        Ok(max)
-    }
-
-    /// Runs the given function on all the timestamps for a given lsn
-    ///
-    /// The return value is either given by the closure, or set to the `Default`
-    /// impl's output.
-    async fn map_all_timestamps<T: Default>(
-        &self,
-        probe_lsn: Lsn,
-        ctx: &RequestContext,
-        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
-    ) -> Result<T, PageReconstructError> {
        for segno in self
            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
            .await?
@@ -446,14 +402,16 @@ impl Timeline {
                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);

-                    match f(timestamp) {
-                        ControlFlow::Break(b) => return Ok(b),
-                        ControlFlow::Continue(()) => (),
+                    if timestamp >= search_timestamp {
+                        *found_larger = true;
+                        return Ok(true);
+                    } else {
+                        *found_smaller = true;
                    }
                }
            }
        }
-        Ok(Default::default())
+        Ok(false)
    }

    /// Get a list of SLRU segments
@@ -541,23 +499,6 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

-    pub async fn list_aux_files(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        match self.get(AUX_FILES_KEY, lsn, ctx).await {
-            Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
-                Ok(dir) => Ok(dir.files),
-                Err(e) => Err(PageReconstructError::from(e)),
-            },
-            Err(e) => {
-                warn!("Failed to get info about AUX files: {}", e);
-                Ok(HashMap::new())
-            }
-        }
-    }
-
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used to initialize the logical size tracking on startup.
    ///
@@ -675,9 +616,7 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
-        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
-            result.add_key(AUX_FILES_KEY);
-        }
+
        Ok(result.to_keyspace())
    }

@@ -753,12 +692,6 @@ impl<'a> DatadirModification<'a> {
        })?;
        self.put(DBDIR_KEY, Value::Image(buf.into()));

-        // Create AuxFilesDirectory
-        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-            files: HashMap::new(),
-        })?;
-        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
-
        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
            xids: HashSet::new(),
        })?;
@@ -863,12 +796,6 @@ impl<'a> DatadirModification<'a> {
            // 'true', now write the updated 'dbdirs' map back.
            let buf = DbDirectory::ser(&dbdir)?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));
-
-            // Create AuxFilesDirectory as well
-            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-                files: HashMap::new(),
-            })?;
-            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
        }
        if r.is_none() {
            // Create RelDirectory
@@ -1193,37 +1120,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub async fn put_file(
-        &mut self,
-        path: &str,
-        content: &[u8],
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
-            Ok(buf) => AuxFilesDirectory::des(&buf)?,
-            Err(e) => {
-                // This is expected: historical databases do not have the key.
-                debug!("Failed to get info about AUX files: {}", e);
-                AuxFilesDirectory {
-                    files: HashMap::new(),
-                }
-            }
-        };
-        let path = path.to_string();
-        if content.is_empty() {
-            dir.files.remove(&path);
-        } else {
-            dir.files.insert(path, Bytes::copy_from_slice(content));
-        }
-        self.put(
-            AUX_FILES_KEY,
-            Value::Image(Bytes::from(
-                AuxFilesDirectory::ser(&dir).context("serialize")?,
-            )),
-        );
-        Ok(())
-    }
-
    ///
    /// Flush changes accumulated so far to the underlying repository.
    ///
@@ -1359,11 +1255,6 @@ struct RelDirectory {
    rels: HashSet<(Oid, u8)>,
 }

-#[derive(Debug, Serialize, Deserialize, Default)]
-struct AuxFilesDirectory {
-    files: HashMap<String, Bytes>,
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 struct RelSizeEntry {
    nblocks: u32,
@@ -1412,12 +1303,10 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 // 02 pg_twophase
 //
 // 03 misc
-//    Controlfile
+//    controlfile
 //    checkpoint
 //    pg_version
 //
-// 04 aux files
-//
 // Below is a full list of the keyspace allocation:
 //
 // DbDir:
@@ -1455,11 +1344,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 //
 // Checkpoint:
 // 03 00000000 00000000 00000000 00   00000001
-//
-// AuxFiles:
-// 03 00000000 00000000 00000000 00   00000002
-//
-
 //-- Section 01: relation data and metadata

 const DBDIR_KEY: Key = Key {
@@ -1683,15 +1567,6 @@ const CHECKPOINT_KEY: Key = Key {
    field6: 1,
 };

-const AUX_FILES_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 2,
-};
-
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -257,12 +257,6 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

-    /// See [`crate::tenant::secondary`].
-    SecondaryDownloads,
-
-    /// See [`crate::tenant::secondary`].
-    SecondaryUploads,
-
    // Initial logical size calculation
    InitialLogicalSizeCalculation,

@@ -299,6 +293,8 @@ pub enum TaskKind {

    DebugTool,

+    BackgroundRuntimeTurnaroundMeasure,
+
    #[cfg(test)]
    UnitTest,
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -21,7 +21,7 @@ use crate::{
 };

 use super::{
-    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
+    mgr::{GetTenantError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
@@ -31,25 +31,16 @@ use super::{
 const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum DeleteTenantError {
+pub enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

-    #[error("Tenant not attached")]
-    NotAttached,
-
    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

    #[error("Tenant deletion is already in progress")]
    AlreadyInProgress,

-    #[error("Tenant map slot error {0}")]
-    SlotError(#[from] TenantSlotError),
-
-    #[error("Tenant map slot upsert error {0}")]
-    SlotUpsertError(#[from] TenantSlotUpsertError),
-
    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

@@ -310,12 +301,12 @@ impl DeleteTenantFlow {
    pub(crate) async fn run(
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

-        let mut guard = Self::prepare(&tenant).await?;
+        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;

        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
            tenant.set_broken(format!("{e:#}")).await;
@@ -385,7 +376,7 @@ impl DeleteTenantFlow {
        Ok(())
    }

-    pub(crate) async fn should_resume_deletion(
+    pub async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
@@ -420,7 +411,7 @@ impl DeleteTenantFlow {
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        init_order: Option<&InitializationOrder>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -441,7 +432,7 @@ impl DeleteTenantFlow {
        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
        if timelines_path.exists() {
-            tenant.load(init_order, None, ctx).await.context("load")?;
+            tenant.load(init_order, ctx).await.context("load")?;
        }

        Self::background(
@@ -457,7 +448,7 @@ impl DeleteTenantFlow {
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -467,10 +458,7 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant
-            .attach(ctx, super::AttachMarkerMode::Expect)
-            .await
-            .context("attach")?;
+        tenant.attach(ctx).await.context("attach")?;

        Self::background(
            guard,
@@ -483,8 +471,15 @@ impl DeleteTenantFlow {
    }

    async fn prepare(
-        tenant: &Arc<Tenant>,
-    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
+        tenants: &tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
+        let m = tenants.read().await;
+
+        let tenant = m
+            .get(&tenant_id)
+            .ok_or(GetTenantError::NotFound(tenant_id))?;
+
        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
        // so at least for now allow deletions only for active tenants. TODO recheck
        // Broken and Stopping is needed for retries.
@@ -518,14 +513,14 @@ impl DeleteTenantFlow {
            )));
        }

-        Ok(guard)
+        Ok((Arc::clone(tenant), guard))
    }

    fn schedule_background(
        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
        let tenant_id = tenant.tenant_id;
@@ -558,7 +553,7 @@ impl DeleteTenantFlow {
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        tenant: &Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -606,7 +601,7 @@ impl DeleteTenantFlow {
            .await
            .context("cleanup_remaining_fs_traces")?;

-        let mut locked = tenants.write().unwrap();
+        let mut locked = tenants.write().await;
        if locked.remove(&tenant.tenant_id).is_none() {
            warn!("Tenant got removed from tenants map during deletion");
        };
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -354,7 +354,8 @@ mod tests {
        }

        // Test a large blob that spans multiple pages
-        let mut large_data = vec![0; 20000];
+        let mut large_data = Vec::new();
+        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
        let pos_large = file.write_blob(&large_data, &ctx).await?;
        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -639,10 +639,147 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for desc in self.iter_historic_layers() {
-            desc.dump();
+        for layer in self.iter_historic_layers() {
+            layer.dump(verbose, ctx)?;
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::LayerMap;
+    use crate::tenant::storage_layer::LayerFileName;
+    use std::str::FromStr;
+    use std::sync::Arc;
+
+    mod l0_delta_layers_updated {
+
+        use crate::tenant::{
+            storage_layer::{AsLayerDesc, PersistentLayerDesc},
+            timeline::layer_manager::LayerFileManager,
+        };
+
+        use super::*;
+
+        struct LayerObject(PersistentLayerDesc);
+
+        impl AsLayerDesc for LayerObject {
+            fn layer_desc(&self) -> &PersistentLayerDesc {
+                &self.0
+            }
+        }
+
+        impl LayerObject {
+            fn new(desc: PersistentLayerDesc) -> Self {
+                LayerObject(desc)
+            }
+        }
+
+        type TestLayerFileManager = LayerFileManager<LayerObject>;
+
+        #[test]
+        fn for_full_range_delta() {
+            // l0_delta_layers are used by compaction, and should observe all buffered updates
+            l0_delta_layers_updated_scenario(
+                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
+                 true
+             )
+        }
+
+        #[test]
+        fn for_non_full_range_delta() {
+            // has minimal uncovered areas compared to the layer used in for_full_range_delta
+            l0_delta_layers_updated_scenario(
+                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
+                 // because not full range
+                 false
+             )
+        }
+
+        #[test]
+        fn for_image() {
+            l0_delta_layers_updated_scenario(
+                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
+                 // code only checks if it is a full range layer, doesn't care about images, which must
+                 // mean we should in practice never have full range images
+                 false
+             )
+        }
+
+        #[test]
+        fn replacing_missing_l0_is_notfound() {
+            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
+            // however only happen for precondition failures.
+
+            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
+            let layer = LayerFileName::from_str(layer).unwrap();
+            let layer = PersistentLayerDesc::from(layer);
+
+            // same skeleton construction; see scenario below
+            let not_found = Arc::new(LayerObject::new(layer.clone()));
+            let new_version = Arc::new(LayerObject::new(layer));
+
+            // after the immutable storage state refactor, the replace operation
+            // will not use layer map any more. We keep it here for consistency in test cases
+            // and can remove it in the future.
+            let _map = LayerMap::default();
+
+            let mut mapping = TestLayerFileManager::new();
+
+            mapping
+                .replace_and_verify(not_found, new_version)
+                .unwrap_err();
+        }
+
+        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
+            let name = LayerFileName::from_str(layer_name).unwrap();
+            let skeleton = PersistentLayerDesc::from(name);
+
+            let remote = Arc::new(LayerObject::new(skeleton.clone()));
+            let downloaded = Arc::new(LayerObject::new(skeleton));
+
+            let mut map = LayerMap::default();
+            let mut mapping = LayerFileManager::new();
+
+            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
+            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
+            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
+
+            let expected_in_counts = (1, usize::from(expected_l0));
+
+            map.batch_update()
+                .insert_historic(remote.layer_desc().clone());
+            mapping.insert(remote.clone());
+            assert_eq!(
+                count_layer_in(&map, remote.layer_desc()),
+                expected_in_counts
+            );
+
+            mapping
+                .replace_and_verify(remote, downloaded.clone())
+                .expect("name derived attributes are the same");
+            assert_eq!(
+                count_layer_in(&map, downloaded.layer_desc()),
+                expected_in_counts
+            );
+
+            map.batch_update().remove_historic(downloaded.layer_desc());
+            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
+        }
+
+        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
+            let historic = map
+                .iter_historic_layers()
+                .filter(|x| x.key() == layer.key())
+                .count();
+            let l0s = map
+                .get_level0_deltas()
+                .expect("why does this return a result");
+            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
+
+            (historic, l0)
+        }
+    }
+}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -57,7 +57,8 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
    fsync_in_thread_pool(paths)
 }

-/// Parallel fsync asynchronously.
+/// Parallel fsync asynchronously. If the number of files is less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
+/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
 pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
    const MAX_CONCURRENT_FSYNC: usize = 64;
    let mut next = paths.iter().peekable();
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -167,6 +167,8 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
+//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
+//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -202,14 +204,15 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

-pub(crate) mod download;
+mod download;
 pub mod index;
 mod upload;

 use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
-
+// re-export these
+pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use utils::backoff::{
@@ -234,7 +237,7 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::storage_layer::AsLayerDesc;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
@@ -252,13 +255,10 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
+use super::storage_layer::LayerFileName;
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

-pub(crate) use download::{is_temp_download_file, list_remote_timelines};
-pub(crate) use index::LayerFileMetadata;
-
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -627,181 +627,101 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub(crate) fn schedule_layer_file_upload(
+    pub fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer: ResidentLayer,
+        layer_file_name: &LayerFileName,
+        layer_metadata: &LayerFileMetadata,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        self.schedule_layer_file_upload0(upload_queue, layer);
-        self.launch_queued_tasks(upload_queue);
-        Ok(())
-    }
-
-    fn schedule_layer_file_upload0(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-        layer: ResidentLayer,
-    ) {
-        let metadata = layer.metadata();
-
        upload_queue
            .latest_files
-            .insert(layer.layer_desc().filename(), metadata.clone());
+            .insert(layer_file_name.clone(), layer_metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

-        info!("scheduled layer file upload {layer}");
-        let op = UploadOp::UploadLayer(layer, metadata);
+        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
+
+        info!("scheduled layer file upload {layer_file_name}");
+
+        // Launch the task immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
    }

    /// Launch a delete operation in the background.
    ///
-    /// The operation does not modify local filesystem state.
+    /// The operation does not modify local state but assumes the local files have already been
+    /// deleted, and is used to mirror those changes to remote.
    ///
    /// Note: This schedules an index file upload before the deletions.  The
-    /// deletion won't actually be performed, until all previously scheduled
+    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: &[LayerFileName],
+        names: Vec<LayerFileName>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        let with_generations =
-            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
-
-        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
-
-        // Launch the tasks immediately, if possible
-        self.launch_queued_tasks(upload_queue);
-        Ok(())
-    }
-
-    /// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
-    /// layer files, leaving them dangling.
-    ///
-    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
-    /// is invoked on them.
-    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        // just forget the return value; after uploading the next index_part.json, we can consider
-        // the layer files as "dangling". this is fine, at worst case we create work for the
-        // scrubber.
-
-        let names = gc_layers.iter().map(|x| x.layer_desc().filename());
-
-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
-
-        self.launch_queued_tasks(upload_queue);
-
-        Ok(())
-    }
-
-    /// Update the remote index file, removing the to-be-deleted files from the index,
-    /// allowing scheduling of actual deletions later.
-    fn schedule_unlinking_of_layers_from_index_part0<I>(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-        names: I,
-    ) -> Vec<(LayerFileName, Generation)>
-    where
-        I: IntoIterator<Item = LayerFileName>,
-    {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

-        // Decorate our list of names with each name's generation, dropping
-        // names that are unexpectedly missing from our metadata.
-        let with_generations: Vec<_> = names
-            .into_iter()
-            .filter_map(|name| {
-                let meta = upload_queue.latest_files.remove(&name);
+        // Update the remote index file, removing the to-be-deleted files from the index,
+        // before deleting the actual files.
+        //
+        // Once we start removing files from upload_queue.latest_files, there's
+        // no going back! Otherwise, some of the files would already be removed
+        // from latest_files, but not yet scheduled for deletion. Use a closure
+        // to syntactically forbid ? or bail! calls here.
+        let no_bail_here = || {
+            // Decorate our list of names with each name's generation, dropping
+            // names that are unexpectedly missing from our metadata.
+            let with_generations: Vec<_> = names
+                .into_iter()
+                .filter_map(|name| {
+                    // Remove from latest_files, learning the file's remote generation in the process
+                    let meta = upload_queue.latest_files.remove(&name);

-                if let Some(meta) = meta {
-                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                    Some((name, meta.generation))
-                } else {
-                    // This can only happen if we forgot to to schedule the file upload
-                    // before scheduling the delete. Log it because it is a rare/strange
-                    // situation, and in case something is misbehaving, we'd like to know which
-                    // layers experienced this.
-                    info!("Deleting layer {name} not found in latest_files list, never uploaded?");
-                    None
-                }
-            })
-            .collect();
+                    if let Some(meta) = meta {
+                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                        Some((name, meta.generation))
+                    } else {
+                        // This can only happen if we forgot to schedule the file upload
+                        // before scheduling the delete. Log it because it is a rare/strange
+                        // situation, and in case something is misbehaving, we'd like to know which
+                        // layers experienced this.
+                        info!(
+                            "Deleting layer {name} not found in latest_files list, never uploaded?"
+                        );
+                        None
+                    }
+                })
+                .collect();

-        // after unlinking files from the upload_queue.latest_files we must always schedule an
-        // index_part update, because that needs to be uploaded before we can actually delete the
-        // files.
-        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue, metadata);
-        }
+            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+                self.schedule_index_upload(upload_queue, metadata);
+            }

-        with_generations
-    }
+            for (name, gen) in &with_generations {
+                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+            }

-    /// Schedules deletion for layer files which have previously been unlinked from the
-    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
-    pub(crate) fn schedule_deletion_of_unlinked(
-        self: &Arc<Self>,
-        layers: Vec<(LayerFileName, Generation)>,
-    ) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        self.schedule_deletion_of_unlinked0(upload_queue, layers);
-        self.launch_queued_tasks(upload_queue);
-        Ok(())
-    }
-
-    fn schedule_deletion_of_unlinked0(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-        with_generations: Vec<(LayerFileName, Generation)>,
-    ) {
-        for (name, gen) in &with_generations {
-            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
-        }
-
-        // schedule the actual deletions
-        let op = UploadOp::Delete(Delete {
-            layers: with_generations,
-        });
-        self.calls_unfinished_metric_begin(&op);
-        upload_queue.queued_operations.push_back(op);
-    }
-
-    /// Schedules a compaction update to the remote `index_part.json`.
-    ///
-    /// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
-    pub(crate) fn schedule_compaction_update(
-        self: &Arc<Self>,
-        compacted_from: &[Layer],
-        compacted_to: &[ResidentLayer],
-    ) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        for layer in compacted_to {
-            self.schedule_layer_file_upload0(upload_queue, layer.clone());
-        }
-
-        let names = compacted_from.iter().map(|x| x.layer_desc().filename());
-
-        let with_generations =
-            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
-        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
-        self.launch_queued_tasks(upload_queue);
+            // schedule the actual deletions
+            let op = UploadOp::Delete(Delete {
+                layers: with_generations,
+            });
+            self.calls_unfinished_metric_begin(&op);
+            upload_queue.queued_operations.push_back(op);

+            // Launch the tasks immediately, if possible
+            self.launch_queued_tasks(upload_queue);
+        };
+        no_bail_here();
        Ok(())
    }

@@ -981,27 +901,9 @@ impl RemoteTimelineClient {
        .await
        .context("list prefixes")?;

-        // We will delete the current index_part object last, since it acts as a deletion
-        // marker via its deleted_at attribute
-        let latest_index = remaining
-            .iter()
-            .filter(|p| {
-                p.object_name()
-                    .map(|n| n.starts_with(IndexPart::FILE_NAME))
-                    .unwrap_or(false)
-            })
-            .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen)))
-            .max_by_key(|i| i.1)
-            .map(|i| i.0.clone())
-            .unwrap_or(
-                // No generation-suffixed indices, assume we are dealing with
-                // a legacy index.
-                remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
-            );
-
-        let remaining_layers: Vec<RemotePath> = remaining
+        let remaining: Vec<RemotePath> = remaining
            .into_iter()
-            .filter(|p| p!= &latest_index)
+            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
            .inspect(|path| {
                if let Some(name) = path.object_name() {
                    info!(%name, "deleting a file not referenced from index_part.json");
@@ -1011,11 +913,9 @@ impl RemoteTimelineClient {
            })
            .collect();

-        let not_referenced_count = remaining_layers.len();
-        if !remaining_layers.is_empty() {
-            self.deletion_queue_client
-                .push_immediate(remaining_layers)
-                .await?;
+        let not_referenced_count = remaining.len();
+        if !remaining.is_empty() {
+            self.deletion_queue_client.push_immediate(remaining).await?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -1024,9 +924,11 @@ impl RemoteTimelineClient {
            ))?
        });

+        let index_file_path = timeline_storage_path.join(Utf8Path::new(IndexPart::FILE_NAME));
+
        debug!("enqueuing index part deletion");
        self.deletion_queue_client
-            .push_immediate([latest_index].to_vec())
+            .push_immediate([index_file_path].to_vec())
            .await?;

        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
@@ -1173,12 +1075,16 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
-                    let path = layer.local_path();
+                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
+                    let path = self
+                        .conf
+                        .timeline_path(&self.tenant_id, &self.timeline_id)
+                        .join(layer_file_name.file_name());
+
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        path,
+                        &path,
                        layer_metadata,
                        self.generation,
                    )
@@ -1495,23 +1401,6 @@ impl RemoteTimelineClient {
            }
        }
    }
-
-    pub(crate) fn get_layers_metadata(
-        &self,
-        layers: Vec<LayerFileName>,
-    ) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
-        let q = self.upload_queue.lock().unwrap();
-        let q = match &*q {
-            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", q.as_str())
-            }
-            UploadQueue::Initialized(inner) => inner,
-        };
-
-        let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
-
-        Ok(decorated.collect())
-    }
 }

 pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
@@ -1552,13 +1441,6 @@ pub fn remote_index_path(
    .expect("Failed to construct path")
 }

-pub const HEATMAP_BASENAME: &str = "heatmap";
-
-pub fn remote_heatmap_path(tenant_id: &TenantId) -> RemotePath {
-    RemotePath::from_string(&format!("tenants/{tenant_id}/{HEATMAP_BASENAME}-v01"))
-        .expect("Failed to construct path")
-}
-
 /// Given the key of an index, parse out the generation part of the name
 pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
@@ -1606,7 +1488,6 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::Layer,
            Generation, Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
@@ -1775,29 +1656,32 @@ mod tests {
        let generation = harness.generation;

        // Create a couple of dummy files,  schedule upload for them
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
+        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
+        let content_1 = dummy_contents("foo");
+        let content_2 = dummy_contents("bar");
+        let content_3 = dummy_contents("baz");

-        let layers = [
-            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
-            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
-            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
-        ]
-        .into_iter()
-        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
-            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
-
-            Layer::for_resident(
-                harness.conf,
-                &timeline,
-                name,
-                LayerFileMetadata::new(contents.len() as u64, generation),
-            )
-        }).collect::<Vec<_>>();
+        for (filename, content) in [
+            (&layer_file_name_1, &content_1),
+            (&layer_file_name_2, &content_2),
+            (&layer_file_name_3, &content_3),
+        ] {
+            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
+        }

        client
-            .schedule_layer_file_upload(layers[0].clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64, generation),
+            )
            .unwrap();
        client
-            .schedule_layer_file_upload(layers[1].clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_2,
+                &LayerFileMetadata::new(content_2.len() as u64, generation),
+            )
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1851,42 +1735,38 @@ mod tests {
                .collect(),
            &[
                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &layer_file_name_1.file_name(),
+                &layer_file_name_2.file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
-            .schedule_layer_file_upload(layers[2].clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_3,
+                &LayerFileMetadata::new(content_3.len() as u64, generation),
+            )
            .unwrap();
-
-        // this is no longer consistent with how deletion works with Layer::drop, but in this test
-        // keep using schedule_layer_file_deletion because we don't have a way to wait for the
-        // spawn_blocking started by the drop.
        client
-            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
+            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            // Deletion schedules upload of the index file, and the file deletion itself
-            assert_eq!(upload_queue.queued_operations.len(), 2);
-            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
-            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
-            assert_eq!(upload_queue.num_inprogress_deletions, 0);
-            assert_eq!(
-                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
-                0
-            );
+            assert!(upload_queue.queued_operations.len() == 2);
+            assert!(upload_queue.inprogress_tasks.len() == 1);
+            assert!(upload_queue.num_inprogress_layer_uploads == 1);
+            assert!(upload_queue.num_inprogress_deletions == 0);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
        }
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &layer_file_name_1.file_name(),
+                &layer_file_name_2.file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1900,8 +1780,8 @@ mod tests {
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layers[1].layer_desc().filename().file_name(),
-                &layers[2].layer_desc().filename().file_name(),
+                &layer_file_name_2.file_name(),
+                &layer_file_name_3.file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1930,13 +1810,6 @@ mod tests {
        )
        .unwrap();

-        let layer_file_1 = Layer::for_resident(
-            harness.conf,
-            &timeline,
-            layer_file_name_1.clone(),
-            LayerFileMetadata::new(content_1.len() as u64, harness.generation),
-        );
-
        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
@@ -1972,7 +1845,10 @@ mod tests {
        let actual_a = get_bytes_started_stopped();

        client
-            .schedule_layer_file_upload(layer_file_1.clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+            )
            .unwrap();

        let actual_b = get_bytes_started_stopped();
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -18,7 +18,7 @@ use crate::config::PageServerConf;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
+use crate::tenant::Generation;
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
@@ -190,12 +190,6 @@ pub async fn list_remote_timelines(
    let mut timeline_ids = HashSet::new();

    for timeline_remote_storage_key in timelines {
-        if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
-            // A `deleted` key within `timelines/` is a marker file, not a timeline.  Ignore it.
-            // This code will be removed in https://github.com/neondatabase/neon/pull/5580
-            continue;
-        }
-
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -31,7 +31,6 @@ pub(super) async fn upload_index_part<'a>(
    fail_point!("before-upload-index", |_| {
        bail!("failpoint before-upload-index")
    });
-    pausable_failpoint!("before-upload-index-pausable");

    let index_part_bytes =
        serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
@@ -60,8 +59,6 @@ pub(super) async fn upload_timeline_layer<'a>(
        bail!("failpoint before-upload-layer")
    });

-    pausable_failpoint!("before-upload-layer-pausable");
-
    let storage_path = remote_path(conf, source_path, generation)?;
    let source_file_res = fs::File::open(&source_path).await;
    let source_file = match source_file_res {
@@ -72,8 +69,6 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
-            //
-            // This is tested against `test_compaction_delete_before_upload`
            info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -1,268 +0,0 @@
-pub mod downloader;
-pub mod heatmap;
-pub mod heatmap_writer;
-
-use std::{sync::Arc, time::SystemTime};
-
-use crate::{
-    config::PageServerConf,
-    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-};
-
-use self::{
-    downloader::{downloader_task, SecondaryDetail},
-    heatmap_writer::heatmap_writer_task,
-};
-
-use super::{
-    mgr::TenantManager,
-    storage_layer::{AsLayerDesc, Layer},
-    timeline::DiskUsageEvictionInfo,
-};
-
-use remote_storage::GenericRemoteStorage;
-
-use tokio_util::sync::CancellationToken;
-use utils::{
-    completion::Barrier,
-    fs_ext,
-    id::{TenantId, TimelineId},
-};
-
-enum DownloadCommand {
-    Download(TenantId),
-}
-enum UploadCommand {
-    Upload(TenantId),
-}
-
-struct CommandRequest<T> {
-    payload: T,
-    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-}
-
-struct CommandResponse {
-    result: anyhow::Result<()>,
-}
-
-// Whereas [`Tenant`] represents an attached tenant, this type represents the work
-// we do for secondary tenant locations: where we are not serving clients or
-// ingesting WAL, but we are maintaining a warm cache of layer files.
-//
-// This type is all about the _download_ path for secondary mode.  The upload path
-// runs while a regular attached `Tenant` exists.
-//
-// This structure coordinates TenantManager and SecondaryDownloader,
-// so that the downloader can indicate which tenants it is currently
-// operating on, and the manager can indicate when a particular
-// secondary tenant should cancel any work in flight.
-#[derive(Debug)]
-pub(crate) struct SecondaryTenant {
-    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
-    /// any work for this tenant at the next opportunity.
-    pub(crate) cancel: CancellationToken,
-
-    /// Lock must be held by SecondaryDownloader at any time that it might be operating
-    /// on the local filesystem directory for this tenant ID.
-    // Ordering: the TenantManager must set the cancellation token _before_
-    // taking the lock.  The SecondaryDownloader must always check the cancellation
-    // token immediately _after_ taking the lock (and at appropriate intervals
-    // while holding it).
-    pub(crate) busy: Arc<tokio::sync::Mutex<()>>,
-
-    detail: std::sync::Mutex<SecondaryDetail>,
-    // TODO: propagate the `warm` from LocationConf into here, and respect it when doing downloads
-}
-
-impl SecondaryTenant {
-    pub(crate) fn new() -> Arc<Self> {
-        // TODO; consider whether we really need to Arc this
-        Arc::new(Self {
-            busy: Arc::new(tokio::sync::Mutex::new(())),
-            // todo: shall we make this a descendent of the
-            // main cancellation token, or is it sufficient that
-            // on shutdown we walk the tenants and fire their
-            // individual cancellations?
-            cancel: CancellationToken::new(),
-
-            detail: std::sync::Mutex::default(),
-        })
-    }
-
-    pub(crate) async fn shutdown(&self) {
-        self.cancel.cancel();
-
-        // Wait for any secondary downloader work to complete: once we
-        // acquire this lock, we are guaranteed that the secondary downloader
-        // won't touch the local filesystem again for this instance: it is safe
-        // to e.g. construct a `Tenant` for the same TenantId
-        drop(self.busy.lock().await);
-    }
-
-    pub(crate) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
-        self.detail.lock().unwrap().get_layers_for_eviction()
-    }
-
-    pub(crate) async fn evict_layers(
-        &self,
-        _guard: tokio::sync::OwnedMutexGuard<()>,
-        conf: &PageServerConf,
-        tenant_id: &TenantId,
-        layers: Vec<(TimelineId, Layer)>,
-    ) {
-        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
-
-        if self.cancel.is_cancelled() {
-            // Eviction is a no-op if shutdown() was already called.
-            tracing::info!(
-                "Dropping {} layer evictions, secondary tenant shutting down",
-                layers.len()
-            );
-            return;
-        }
-
-        let now = SystemTime::now();
-
-        for (timeline_id, layer) in layers {
-            let layer_name = layer.layer_desc().filename();
-            let path = conf
-                .timeline_path(tenant_id, &timeline_id)
-                .join(&layer_name.file_name());
-
-            // We tolerate ENOENT, because between planning eviction and executing
-            // it, the secondary downloader could have seen an updated heatmap that
-            // resulted in a layer being deleted.
-            tokio::fs::remove_file(path)
-                .await
-                .or_else(fs_ext::ignore_not_found)
-                .expect("TODO: terminate process on local I/O errors");
-
-            // TODO: batch up updates instead of acquiring lock in inner loop
-            let mut detail = self.detail.lock().unwrap();
-            // If there is no timeline detail for what we just deleted, that indicates that
-            // the secondary downloader did some work (perhaps removing all)
-            if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-                timeline_detail.on_disk_layers.remove(&layer_name);
-                timeline_detail.evicted_at.insert(layer_name, now);
-            }
-        }
-    }
-}
-
-/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
-/// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
-/// where we want to immediately upload/download for a particular tenant.  In normal operation
-/// uploads & downloads are autonomous and not driven by this interface.
-pub struct SecondaryController {
-    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
-
-    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
-}
-
-impl SecondaryController {
-    async fn dispatch<T>(
-        &self,
-        queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
-        payload: T,
-    ) -> anyhow::Result<()> {
-        let (response_tx, response_rx) = tokio::sync::oneshot::channel();
-
-        queue
-            .send(CommandRequest {
-                payload,
-                response_tx,
-            })
-            .await
-            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
-
-        let response = response_rx
-            .await
-            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
-
-        response.result
-    }
-
-    pub async fn download_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
-        self.dispatch(&self.download_req_tx, DownloadCommand::Download(tenant_id))
-            .await
-    }
-
-    pub async fn upload_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
-        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_id))
-            .await
-    }
-}
-
-pub fn spawn_tasks(
-    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) -> SecondaryController {
-    let mgr_clone = tenant_manager.clone();
-    let storage_clone = remote_storage.clone();
-    let cancel_clone = cancel.clone();
-    let bg_jobs_clone = background_jobs_can_start.clone();
-
-    let (download_req_tx, download_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
-    let (upload_req_tx, upload_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-
-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::SecondaryDownloads,
-        None,
-        None,
-        "secondary tenant downloads",
-        false,
-        async move {
-            downloader_task(
-                conf,
-                mgr_clone,
-                storage_clone,
-                download_req_rx,
-                bg_jobs_clone,
-                cancel_clone,
-            )
-            .await
-        },
-    );
-
-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::SecondaryDownloads,
-        None,
-        None,
-        "heatmap uploads",
-        false,
-        async move {
-            heatmap_writer_task(
-                tenant_manager,
-                remote_storage,
-                upload_req_rx,
-                background_jobs_can_start,
-                cancel,
-            )
-            .await
-        },
-    );
-
-    SecondaryController {
-        download_req_tx,
-        upload_req_tx,
-    }
-}
-
-/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
-pub fn null_controller() -> SecondaryController {
-    let (download_req_tx, _download_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
-    let (upload_req_tx, _upload_req_rx) =
-        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController {
-        upload_req_tx,
-        download_req_tx,
-    }
-}
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -1,579 +0,0 @@
-use std::{
-    collections::{HashMap, HashSet},
-    str::FromStr,
-    sync::Arc,
-    time::{Duration, Instant, SystemTime},
-};
-
-use crate::{
-    config::PageServerConf,
-    tenant::{
-        remote_timeline_client::index::LayerFileMetadata,
-        secondary::CommandResponse,
-        storage_layer::{Layer, LayerFileName},
-        timeline::{DiskUsageEvictionInfo, LocalLayerInfoForDiskUsageEviction},
-    },
-    METADATA_FILE_NAME,
-};
-
-use super::SecondaryTenant;
-use crate::tenant::{
-    mgr::TenantManager,
-    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
-};
-use anyhow::Context;
-
-use chrono::format::{DelayedFormat, StrftimeItems};
-use remote_storage::GenericRemoteStorage;
-
-use tokio_util::sync::CancellationToken;
-use tracing::Instrument;
-use utils::{
-    completion::Barrier,
-    fs_ext,
-    id::{TenantId, TimelineId},
-};
-
-use super::{
-    heatmap::{HeatMapTenant, HeatMapTimeline},
-    CommandRequest, DownloadCommand,
-};
-
-/// How often the downloader wakes up to look for secondary tenants with download work
-/// pending.  This is only the polling cadence for deciding whether any tenant needs
-/// freshening, _not_ the rate at which an individual tenant is freshened.
-///
-/// Keeping this fairly long limits read traffic on the mutexes inside TenantManager
-/// and SecondaryTenant while scanning for work.
-const DOWNLOAD_CHECK_INTERVAL: Duration = Duration::from_secs(10);
-
-/// Minimum time that must elapse after a tenant's last freshen_tenant call before it
-/// is called again; roughly how far local data is allowed to lag behind remote data.
-///
-/// TODO: treat this as an upper bound, and adaptively freshen tenants that upload
-/// regularly more often (e.g. a tenant writing 1 layer per second should not wait
-/// a minute between freshens)
-const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_secs(60);
-
-/// In-memory record of one layer file that a secondary location holds on local disk.
-#[derive(Debug, Clone)]
-pub(super) struct OnDiskState {
-    // Layer handle, built with `Layer::for_secondary` in `OnDiskState::new`.
-    layer: Layer,
-    // Reported as `last_activity_ts` when layers are listed for disk usage eviction.
-    access_time: SystemTime,
-}
-
-impl OnDiskState {
-    /// Wraps the named layer of the given tenant/timeline in a secondary-mode [`Layer`]
-    /// and pairs it with `access_time`.
-    fn new(
-        conf: &'static PageServerConf,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
-        name: LayerFileName,
-        metadata: LayerFileMetadata,
-        access_time: SystemTime,
-    ) -> Self {
-        let layer = Layer::for_secondary(conf, tenant_id, timeline_id, name, metadata);
-        Self { layer, access_time }
-    }
-}
-
-/// Per-timeline bookkeeping kept by the secondary downloader.
-#[derive(Debug, Clone, Default)]
-pub(super) struct SecondaryDetailTimeline {
-    /// Layers recorded as present on local disk, keyed by layer file name.
-    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
-
-    /// We remember when layers were evicted, to prevent re-downloading them.
-    /// TODO: persist this, so that we don't try and re-download everything on restart.
-    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
-}
-
-/// This state is written by the secondary downloader, it is opaque
-/// to TenantManager
-#[derive(Default, Debug)]
-pub(super) struct SecondaryDetail {
-    /// Set by the downloader's periodic pass after each freshen attempt, whether it
-    /// succeeded or failed; `None` until the first such attempt.
-    freshened_at: Option<Instant>,
-    /// Per-timeline layer state, keyed by timeline ID.
-    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
-}
-
-/// Logging helper: renders a `SystemTime` as a UTC timestamp in `%d/%m/%Y %T` form.
-/// The returned value formats lazily when it is displayed.
-fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
-    chrono::DateTime::<chrono::Utc>::from(*t).format("%d/%m/%Y %T")
-}
-
-impl SecondaryDetail {
-    /// Lists, per timeline, every layer recorded as on disk together with its access
-    /// time, plus the size of the largest such layer (`None` for a timeline with no
-    /// layers), in the shape consumed by disk usage eviction.
-    pub(super) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
-        self.timelines
-            .iter()
-            .map(|(timeline_id, timeline_detail)| {
-                let mut resident_layers =
-                    Vec::with_capacity(timeline_detail.on_disk_layers.len());
-                for on_disk in timeline_detail.on_disk_layers.values() {
-                    resident_layers.push(LocalLayerInfoForDiskUsageEviction {
-                        layer: on_disk.layer.clone(),
-                        last_activity_ts: on_disk.access_time,
-                    });
-                }
-
-                let max_layer_size = resident_layers
-                    .iter()
-                    .map(|l| l.layer.metadata().file_size())
-                    .max();
-
-                (
-                    *timeline_id,
-                    DiskUsageEvictionInfo {
-                        resident_layers,
-                        max_layer_size,
-                    },
-                )
-            })
-            .collect()
-    }
-}
-
-/// Keep trying to do downloads until the cancellation token is fired.  Remote storage
-/// errors are handled internally: any error returned by this function is an unexpected
-/// internal error of some kind.
-///
-/// After `background_jobs_can_start` is released, each loop turn runs one
-/// [`SecondaryDownloader::iteration`] and then waits for whichever comes first:
-/// cancellation, the `DOWNLOAD_CHECK_INTERVAL` timer, or an administrative command on
-/// `command_queue`.  A command is executed inline and its result is sent back on the
-/// request's response channel.
-pub(super) async fn downloader_task(
-    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    let downloader = SecondaryDownloader {
-        conf,
-        tenant_manager,
-        remote_storage,
-        cancel: cancel.clone(),
-    };
-
-    tracing::info!("Waiting for background_jobs_can_start...");
-    background_jobs_can_start.wait().await;
-    tracing::info!("background_jobs_can_start is ready, proceeding.");
-
-    while !cancel.is_cancelled() {
-        downloader.iteration().await?;
-
-        tokio::select! {
-            _ = cancel.cancelled() => {
-                // This is the downloader task, not the heatmap writer: name it correctly
-                // so that shutdown logs can be attributed to the right task.
-                tracing::info!("Secondary downloader terminating");
-                break;
-            },
-            _ = tokio::time::sleep(DOWNLOAD_CHECK_INTERVAL) => {},
-            cmd = command_queue.recv() => {
-                let cmd = match cmd {
-                    Some(c) =>c,
-                    None => {
-                        // SecondaryController was destroyed, and this has raced with
-                        // our CancellationToken
-                        tracing::info!("Secondary downloader terminating");
-                        break;
-                    }
-                };
-
-                let CommandRequest{
-                    response_tx,
-                    payload
-                } = cmd;
-                let result = downloader.handle_command(payload).await;
-                if response_tx.send(CommandResponse{result}).is_err() {
-                    // Caller went away, e.g. because an HTTP request timed out
-                    tracing::info!("Dropping response to administrative command")
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-/// Shared context for the secondary download loop; built once in `downloader_task`.
-struct SecondaryDownloader {
-    // Used to resolve local timeline paths and passed through to layer downloads.
-    conf: &'static PageServerConf,
-    // Source of the secondary tenants to scan (`foreach_secondary_tenants`).
-    tenant_manager: Arc<TenantManager>,
-    // Where heatmaps and layer files are downloaded from.
-    remote_storage: GenericRemoteStorage,
-    // Checked between timelines and between layers so that long freshens stop early.
-    cancel: CancellationToken,
-}
-
-/// One unit of download work: a secondary tenant plus the lock that entitles us to
-/// touch its local directory.
-struct TenantJob {
-    tenant_id: TenantId,
-    secondary_state: Arc<SecondaryTenant>,
-
-    // This mutex guard conveys the right to write to the tenant's local directory: it must
-    // be taken before doing downloads, and TenantManager must ensure it has been released
-    // before it considers shutdown complete for the secondary state -- [`SecondaryDownloader`]
-    // will thereby never be racing with [`Tenant`] for access to local files.
-    _guard: tokio::sync::OwnedMutexGuard<()>,
-}
-
-impl SecondaryDownloader {
-    /// Runs one pass over all secondary tenants, freshening (one at a time) each tenant
-    /// that has never been freshened or whose last attempt is older than
-    /// `DOWNLOAD_FRESHEN_INTERVAL`.  Per-tenant failures are logged, not returned.
-    async fn iteration(&self) -> anyhow::Result<()> {
-        // Step 1: collect every secondary tenant whose `busy` lock can be taken right now.
-        // If the lock is held, someone is shutting the tenant down or we are already
-        // working on it, so it is skipped for this pass.
-        let mut jobs: Vec<TenantJob> = Vec::new();
-        self.tenant_manager
-            .foreach_secondary_tenants(|tenant_id, secondary_state| {
-                if let Ok(guard) = secondary_state.busy.clone().try_lock_owned() {
-                    jobs.push(TenantJob {
-                        tenant_id: *tenant_id,
-                        secondary_state: secondary_state.clone(),
-                        _guard: guard,
-                    });
-                }
-            });
-
-        // Step 2: keep only the tenants that are due.  The filter is lazy, so each tenant
-        // is checked just before its turn in the loop below.
-        let scan_started = Instant::now();
-        let due_jobs = jobs.into_iter().filter(|job| {
-            let last_freshened = job.secondary_state.detail.lock().unwrap().freshened_at;
-            match last_freshened {
-                Some(t) => scan_started.duration_since(t) > DOWNLOAD_FRESHEN_INTERVAL,
-                // Not yet freshened, therefore eligible to run
-                None => true,
-            }
-        });
-
-        // TODO: don't just cut down the list, prioritize it to freshen the stalest tenants first
-        // TODO: bounded parallelism
-
-        // Step 3: freshen each due tenant in turn
-        for job in due_jobs {
-            if job.secondary_state.cancel.is_cancelled() {
-                continue;
-            }
-
-            let span = tracing::info_span!(
-                "freshen_tenant",
-                tenant_id = %job.tenant_id
-            );
-            async {
-                let outcome = self.freshen_tenant(&job).await;
-                if let Err(e) = outcome {
-                    tracing::info!("Failed to freshen secondary content: {e:#}")
-                }
-
-                // Stamp freshened_at on failure too: a tenant that errored must not
-                // implicitly jump the queue on the next pass.
-                let mut detail = job.secondary_state.detail.lock().unwrap();
-                detail.freshened_at = Some(Instant::now());
-            }
-            .instrument(span)
-            .await;
-        }
-
-        Ok(())
-    }
-
-    /// Executes an administrative command.  `Download(tenant_id)` freshens that tenant
-    /// immediately; it fails if the tenant is not found among the secondary tenants or
-    /// its `busy` lock cannot be taken.
-    async fn handle_command(&self, command: DownloadCommand) -> anyhow::Result<()> {
-        let DownloadCommand::Download(req_tenant_id) = command;
-
-        let mut matches: Vec<TenantJob> = Vec::new();
-        self.tenant_manager
-            .foreach_secondary_tenants(|tenant_id, secondary_state| {
-                tracing::info!("foreach_secondary: {tenant_id} ({req_tenant_id})");
-                if tenant_id != &req_tenant_id {
-                    return;
-                }
-
-                // If we can't lock, someone is in the process of shutting it down, or we are
-                // already working on it: treat the tenant as unavailable.
-                if let Ok(guard) = secondary_state.busy.clone().try_lock_owned() {
-                    matches.push(TenantJob {
-                        tenant_id: *tenant_id,
-                        secondary_state: secondary_state.clone(),
-                        _guard: guard,
-                    });
-                }
-            });
-
-        // Proceed only when exactly one job was collected.
-        let tenant_job = match matches.pop() {
-            Some(job) if matches.is_empty() => job,
-            _ => anyhow::bail!("Tenant not found in secondary mode"),
-        };
-
-        self.freshen_tenant(&tenant_job).await
-    }
-
-    /// Fetches the tenant's heatmap object from remote storage, reads it fully into
-    /// memory and parses it as JSON.
-    async fn download_heatmap(&self, tenant_id: &TenantId) -> anyhow::Result<HeatMapTenant> {
-        // TODO: make download conditional on ETag having changed since last download
-
-        let heatmap_path = remote_heatmap_path(tenant_id);
-        // TODO: wrap this download in a select! that checks self.cancel
-        let mut download = self.remote_storage.download(&heatmap_path).await?;
-
-        let mut body = Vec::new();
-        tokio::io::copy(&mut download.download_stream, &mut body)
-            .await
-            .with_context(|| format!("download heatmap {heatmap_path:?}"))?;
-
-        let heatmap: HeatMapTenant = serde_json::from_slice(&body)?;
-        Ok(heatmap)
-    }
-
-    /// Builds the initial in-memory state for a timeline by scanning its local directory
-    /// and reconciling what is found against `heatmap`.
-    ///
-    /// * A missing directory is created and an empty state is returned.
-    /// * The local metadata file is skipped.
-    /// * A layer file listed in the heatmap with a matching size is recorded as on disk,
-    ///   using the heatmap's metadata and access time.
-    /// * A layer file whose size differs from the heatmap, or that the heatmap does not
-    ///   list, is deleted.
-    /// * Any file whose name does not parse as a layer file name is left alone, with a
-    ///   warning.
-    ///
-    /// I/O errors (other than the directory not existing) are returned to the caller.
-    async fn init_timeline_state(
-        &self,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
-        heatmap: &HeatMapTimeline,
-    ) -> anyhow::Result<SecondaryDetailTimeline> {
-        let timeline_path = self.conf.timeline_path(tenant_id, timeline_id);
-        let mut detail = SecondaryDetailTimeline::default();
-
-        let mut dir = match tokio::fs::read_dir(&timeline_path).await {
-            Ok(d) => d,
-            Err(e) => {
-                if e.kind() == std::io::ErrorKind::NotFound {
-                    tracing::info!("Creating timeline directory {timeline_path}");
-                    tokio::fs::create_dir(&timeline_path).await?;
-
-                    // No entries to report: drop out.
-                    return Ok(detail);
-                } else {
-                    return Err(e.into());
-                }
-            }
-        };
-
-        // Index the heatmap's layers by name for lookup while walking the directory.
-        let heatmap_metadata: HashMap<_, _> = heatmap.layers.iter().map(|l| (&l.name, l)).collect();
-
-        while let Some(dentry) = dir.next_entry().await? {
-            let dentry_file_name = dentry.file_name();
-            let file_name = dentry_file_name.to_string_lossy();
-            let local_meta = dentry.metadata().await?;
-
-            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
-            if file_name == METADATA_FILE_NAME {
-                continue;
-            }
-
-            match LayerFileName::from_str(&file_name) {
-                Ok(name) => {
-                    let remote_meta = heatmap_metadata.get(&name);
-                    match remote_meta {
-                        Some(remote_meta) => {
-                            // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
-                            if local_meta.len() != remote_meta.metadata.file_size {
-                                // This should not happen, because we do crashsafe write-then-rename when downloading
-                                // layers, and layers in remote storage are immutable.  Remove the local file because
-                                // we cannot trust it.
-                                tracing::warn!("Removing local layer {name} with unexpected local size {} != {}",
-                                    local_meta.len(), remote_meta.metadata.file_size);
-                                // Actually delete the untrusted file: it is not recorded in
-                                // `detail`, so a later freshen can download it again.
-                                tokio::fs::remove_file(dentry.path()).await?;
-                            } else {
-                                // We expect the access time to be initialized immediately afterwards, when
-                                // the latest heatmap is applied to the state.
-                                detail.on_disk_layers.insert(
-                                    name.clone(),
-                                    OnDiskState::new(
-                                        self.conf,
-                                        tenant_id,
-                                        timeline_id,
-                                        name,
-                                        LayerFileMetadata::from(&remote_meta.metadata),
-                                        remote_meta.access_time,
-                                    ),
-                                );
-                            }
-                        }
-                        None => {
-                            // FIXME: consider some optimization when transitioning from attached to secondary: maybe
-                            // wait until we have seen a heatmap that is more recent than the most recent on-disk state?  Otherwise
-                            // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
-                            tracing::info!(
-                                "Removing secondary local layer {} because it's absent in heatmap",
-                                name
-                            );
-                            tokio::fs::remove_file(dentry.path()).await?;
-                        }
-                    }
-                }
-                Err(_) => {
-                    // Ignore it.
-                    tracing::warn!("Unexpected file in timeline directory: {file_name}");
-                }
-            }
-        }
-
-        Ok(detail)
-    }
-
-    /// Brings one timeline's local layer files in line with its heatmap entry.
-    ///
-    /// Steps:
-    /// 1. Take a snapshot of the timeline's recorded state, or build one by scanning the
-    ///    local directory if no state is recorded yet.
-    /// 2. Delete local layers that the heatmap no longer lists.
-    /// 3. For each heatmap layer: refresh the access time if it is already on disk, skip
-    ///    it if it was evicted after its last access, otherwise download it.
-    /// 4. Write the outcome back into the tenant's shared state.
-    ///
-    /// Download failures are logged and skipped.  If cancellation is observed between
-    /// layers the function returns `Ok(())` without writing state back.
-    async fn freshen_timeline(
-        &self,
-        job: &TenantJob,
-        timeline: HeatMapTimeline,
-    ) -> anyhow::Result<()> {
-        use std::collections::hash_map::Entry;
-
-        let timeline_path = self
-            .conf
-            .timeline_path(&job.tenant_id, &timeline.timeline_id);
-
-        // Accumulate updates to the state
-        let mut touched = Vec::new();
-
-        // Clone a view of what layers already exist on disk
-        let timeline_state = job
-            .secondary_state
-            .detail
-            .lock()
-            .unwrap()
-            .timelines
-            .get(&timeline.timeline_id)
-            .cloned();
-
-        let timeline_state = match timeline_state {
-            Some(t) => t,
-            None => {
-                // We have no existing state: need to scan local disk for layers first.
-                self.init_timeline_state(&job.tenant_id, &timeline.timeline_id, &timeline)
-                    .await?
-            }
-        };
-
-        // Names of layers we hold locally that the heatmap no longer lists.  Collected as
-        // owned names so that they can also be dropped from the shared state at the end.
-        let stale_layers: Vec<LayerFileName> = {
-            let layers_in_heatmap = timeline
-                .layers
-                .iter()
-                .map(|l| &l.name)
-                .collect::<HashSet<_>>();
-            timeline_state
-                .on_disk_layers
-                .keys()
-                .filter(|name| !layers_in_heatmap.contains(name))
-                .cloned()
-                .collect()
-        };
-
-        // Remove on-disk layers that are no longer present in heatmap
-        for layer in &stale_layers {
-            let local_path = timeline_path.join(layer.to_string());
-            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
-            tokio::fs::remove_file(&local_path)
-                .await
-                .or_else(fs_ext::ignore_not_found)?;
-        }
-
-        // Download heatmap layers that are not present on local disk, or update their
-        // access time if they are already present.
-        for layer in timeline.layers {
-            if self.cancel.is_cancelled() {
-                return Ok(());
-            }
-
-            // Existing on-disk layers: just update their access time.
-            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
-                if on_disk.layer.metadata() != LayerFileMetadata::from(&layer.metadata)
-                    || on_disk.access_time != layer.access_time
-                {
-                    // We already have this layer on disk.  Update its access time.
-                    tracing::trace!(
-                        "Access time updated for layer {}: {} -> {}",
-                        layer.name,
-                        strftime(&on_disk.access_time),
-                        strftime(&layer.access_time)
-                    );
-                    touched.push(layer);
-                }
-                continue;
-            }
-
-            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
-            // recently than it was evicted.
-            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
-                if &layer.access_time > evicted_at {
-                    tracing::info!(
-                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
-                        layer.name,
-                        strftime(&layer.access_time),
-                        strftime(evicted_at)
-                    );
-                } else {
-                    tracing::trace!(
-                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
-                        layer.name,
-                        strftime(&layer.access_time),
-                        strftime(evicted_at)
-                    );
-                    continue;
-                }
-            }
-
-            match download_layer_file(
-                self.conf,
-                &self.remote_storage,
-                job.tenant_id,
-                timeline.timeline_id,
-                &layer.name,
-                &LayerFileMetadata::from(&layer.metadata),
-            )
-            .await
-            {
-                Ok(downloaded_bytes) => {
-                    if downloaded_bytes != layer.metadata.file_size {
-                        let local_path = timeline_path.join(layer.name.to_string());
-
-                        tracing::error!(
-                            "Downloaded layer {} with unexpected size {} != {}",
-                            layer.name,
-                            downloaded_bytes,
-                            layer.metadata.file_size
-                        );
-
-                        tokio::fs::remove_file(&local_path)
-                            .await
-                            .or_else(fs_ext::ignore_not_found)?;
-
-                        // The file has just been deleted: it must not be recorded as
-                        // on disk, so that a later freshen retries the download.
-                        continue;
-                    }
-
-                    touched.push(layer)
-                }
-                Err(e) => {
-                    // No retries here: secondary downloads don't have to succeed: if they fail we just proceed and expect
-                    // that on some future call to freshen the download will work.
-                    // TODO: refine this behavior.
-                    tracing::info!("Failed to download layer {}: {}", layer.name, e);
-                }
-            }
-        }
-
-        // Write updates to state to record layers we just downloaded or touched.
-        {
-            let mut detail = job.secondary_state.detail.lock().unwrap();
-
-            // If the timeline has no recorded state yet, seed it with the snapshot we
-            // worked from (the result of the disk scan) instead of an empty default:
-            // otherwise layers found on disk whose access time did not change would be
-            // forgotten and downloaded again on the next freshen.
-            let timeline_detail = detail
-                .timelines
-                .entry(timeline.timeline_id)
-                .or_insert(timeline_state);
-
-            // Forget the layers whose files were deleted above.
-            for name in &stale_layers {
-                timeline_detail.on_disk_layers.remove(name);
-            }
-
-            for t in touched {
-                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
-                    Entry::Occupied(mut v) => {
-                        v.get_mut().access_time = t.access_time;
-                    }
-                    Entry::Vacant(e) => {
-                        e.insert(OnDiskState::new(
-                            self.conf,
-                            &job.tenant_id,
-                            &timeline.timeline_id,
-                            t.name,
-                            LayerFileMetadata::from(&t.metadata),
-                            t.access_time,
-                        ));
-                    }
-                }
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Downloads the tenant's heatmap and freshens each timeline listed in it, in order.
-    /// Stops early (still returning `Ok`) once cancellation is observed; the first
-    /// timeline error aborts the remaining timelines and is returned.
-    async fn freshen_tenant(&self, job: &TenantJob) -> anyhow::Result<()> {
-        let HeatMapTenant { timelines } = self.download_heatmap(&job.tenant_id).await?;
-
-        for timeline in timelines {
-            if self.cancel.is_cancelled() {
-                break;
-            }
-
-            self.freshen_timeline(job, timeline).await?;
-        }
-
-        Ok(())
-    }
-}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,57 +0,0 @@
-use std::time::SystemTime;
-
-use crate::tenant::{
-    remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
-};
-
-use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
-
-use utils::id::TimelineId;
-
-/// Top-level heatmap document for one tenant; (de)serialized through serde.
-#[derive(Serialize, Deserialize)]
-pub(super) struct HeatMapTenant {
-    /// One entry per timeline included in the heatmap.
-    pub(super) timelines: Vec<HeatMapTimeline>,
-}
-
-/// Heatmap entry describing a single layer file.
-#[derive(Serialize, Deserialize)]
-pub(crate) struct HeatMapLayer {
-    /// File name of the layer.
-    pub(super) name: LayerFileName,
-    /// Index metadata for the layer.
-    pub(super) metadata: IndexLayerMetadata,
-
-    /// Access time recorded for the layer when the heatmap entry was built.
-    pub(super) access_time: SystemTime,
-    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
-    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
-}
-
-impl HeatMapLayer {
-    /// Assembles a heatmap entry from its three parts; no validation is performed.
-    pub(crate) fn new(
-        name: LayerFileName,
-        metadata: IndexLayerMetadata,
-        access_time: SystemTime,
-    ) -> Self {
-        Self { name, metadata, access_time }
-    }
-}
-
-/// Heatmap entry for one timeline: its ID and the layers listed for it.
-#[serde_as]
-#[derive(Serialize, Deserialize)]
-pub(crate) struct HeatMapTimeline {
-    /// Serialized via its `Display`/`FromStr` string form rather than its default serde encoding.
-    #[serde_as(as = "DisplayFromStr")]
-    pub(super) timeline_id: TimelineId,
-
-    /// Layers listed for this timeline.
-    pub(super) layers: Vec<HeatMapLayer>,
-}
-
-impl HeatMapTimeline {
-    /// Assembles a timeline heatmap entry from a timeline ID and its layer list.
-    pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
-        Self { timeline_id, layers }
-    }
-}
--- a/pageserver/src/tenant/secondary/heatmap_writer.rs
+++ b/pageserver/src/tenant/secondary/heatmap_writer.rs
@@ -1,207 +0,0 @@
-use std::{collections::HashMap, sync::Arc, time::Duration};
-
-use crate::tenant::{
-    mgr::TenantManager, remote_timeline_client::remote_heatmap_path, secondary::CommandResponse,
-    Tenant,
-};
-
-use pageserver_api::models::TenantState;
-use remote_storage::GenericRemoteStorage;
-
-use tokio_util::sync::CancellationToken;
-use tracing::Instrument;
-use utils::{backoff, completion::Barrier};
-
-use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
-
-/// How long the heatmap writer sleeps between passes over the attached tenants.
-const HEATMAP_UPLOAD_INTERVAL: Duration = Duration::from_secs(60);
-
-/// Background loop that periodically uploads heatmaps for attached tenants.
-///
-/// After `background_jobs_can_start` is released, each turn runs one
-/// [`HeatmapWriter::iteration`] and then waits for whichever comes first: cancellation,
-/// the `HEATMAP_UPLOAD_INTERVAL` timer, or an administrative command on
-/// `command_queue`.  A command is executed inline and its result is sent back on the
-/// request's response channel.  An error from `iteration` ends the task.
-pub(super) async fn heatmap_writer_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    let writer = HeatmapWriter {
-        tenant_manager,
-        remote_storage,
-        cancel: cancel.clone(),
-    };
-
-    tracing::info!("Waiting for background_jobs_can start...");
-    background_jobs_can_start.wait().await;
-    tracing::info!("background_jobs_can is ready, proceeding.");
-
-    loop {
-        if cancel.is_cancelled() {
-            break;
-        }
-
-        writer.iteration().await?;
-
-        tokio::select! {
-            _ = cancel.cancelled() => {
-                tracing::info!("Heatmap writer terminating");
-                break;
-            },
-            _ = tokio::time::sleep(HEATMAP_UPLOAD_INTERVAL) => {},
-            cmd = command_queue.recv() => match cmd {
-                None => {
-                    // The SecondaryController is gone; this raced with our
-                    // CancellationToken.
-                    tracing::info!("Heatmap writer terminating");
-                    break;
-                }
-                Some(CommandRequest { response_tx, payload }) => {
-                    let result = writer.handle_command(payload).await;
-                    if response_tx.send(CommandResponse { result }).is_err() {
-                        // Caller went away, e.g. because an HTTP request timed out
-                        tracing::info!("Dropping response to administrative command")
-                    }
-                }
-            },
-        }
-    }
-
-    Ok(())
-}
-
-/// Shared context for the heatmap upload loop; built once in `heatmap_writer_task`.
-struct HeatmapWriter {
-    // Source of the attached tenants whose heatmaps are written.
-    tenant_manager: Arc<TenantManager>,
-    // Destination for the serialized heatmaps.
-    remote_storage: GenericRemoteStorage,
-    // Checked between tenants so that a pass over many tenants stops early.
-    cancel: CancellationToken,
-}
-
-impl HeatmapWriter {
-    /// Runs one pass over the attached tenants, uploading a heatmap for each one that is
-    /// `Active`.  Per-tenant failures are logged as warnings and do not stop the pass;
-    /// the pass ends early once cancellation is observed.
-    async fn iteration(&self) -> anyhow::Result<()> {
-        for tenant in self.tenant_manager.get_attached_tenants() {
-            if self.cancel.is_cancelled() {
-                break;
-            }
-
-            if tenant.current_state() != TenantState::Active {
-                continue;
-            }
-
-            // TODO: add a mechanism to check whether the active layer set has
-            // changed since our last write
-
-            // TODO: add a minimum time between uploads
-
-            let span = tracing::info_span!(
-                "write_tenant",
-                tenant_id = %tenant.get_tenant_id()
-            );
-            if let Err(e) = self.write_tenant(&tenant).instrument(span).await {
-                tracing::warn!(
-                    "Failed to upload heatmap for tenant {}: {e:#}",
-                    tenant.get_tenant_id(),
-                )
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Executes an administrative command.  `Upload(tenant_id)` writes that tenant's
-    /// heatmap immediately, failing if the tenant is not among the attached tenants.
-    async fn handle_command(&self, command: UploadCommand) -> anyhow::Result<()> {
-        let UploadCommand::Upload(tenant_id) = command;
-
-        let tenants = self.tenant_manager.get_attached_tenants();
-        let by_id: HashMap<_, _> = tenants.iter().map(|t| (t.get_tenant_id(), t)).collect();
-
-        if let Some(tenant) = by_id.get(&tenant_id) {
-            self.write_tenant(tenant).await
-        } else {
-            Err(anyhow::anyhow!("Tenant is not attached"))
-        }
-    }
-
-    /// Generates the heatmap for `tenant`, serializes it to JSON and uploads it to the
-    /// tenant's heatmap path in remote storage.
-    ///
-    /// Returns `Ok(())` without uploading when the tenant is shutting down or when any
-    /// timeline cannot produce a heatmap yet.  An upload failure is returned as an
-    /// error unless the tenant's cancellation token has fired, in which case it is
-    /// swallowed.
-    async fn write_tenant(&self, tenant: &Arc<Tenant>) -> anyhow::Result<()> {
-        let mut heatmap = HeatMapTenant {
-            timelines: Vec::new(),
-        };
-        // Snapshot the timelines so that the lock is not held while generating heatmaps.
-        let timelines = tenant.timelines.lock().unwrap().clone();
-
-        let tenant_cancel = tenant.cancel.clone();
-
-        // Ensure that Tenant::shutdown waits for any upload in flight
-        let _guard = {
-            let hook = tenant.heatmap_hook.lock().unwrap();
-            match hook.enter() {
-                Some(g) => g,
-                None => {
-                    // Tenant is shutting down
-                    tracing::info!("Skipping, tenant is shutting down");
-                    return Ok(());
-                }
-            }
-        };
-
-        for (timeline_id, timeline) in timelines {
-            let heatmap_timeline = timeline.generate_heatmap().await;
-            match heatmap_timeline {
-                None => {
-                    // A partial heatmap is never uploaded: give up on this tenant for now.
-                    tracing::debug!(
-                        "Skipping heatmap upload because timeline {timeline_id} is not ready"
-                    );
-                    return Ok(());
-                }
-                Some(heatmap_timeline) => {
-                    heatmap.timelines.push(heatmap_timeline);
-                }
-            }
-        }
-
-        // Serialize the heatmap
-        let bytes = serde_json::to_vec(&heatmap)?;
-        let size = bytes.len();
-
-        let path = remote_heatmap_path(&tenant.get_tenant_id());
-
-        // Write the heatmap.
-        tracing::debug!("Uploading {size} byte heatmap to {path}");
-        if let Err(e) = backoff::retry(
-            || async {
-                // Each attempt gets its own reader over a fresh copy of the payload.
-                let bytes = tokio::io::BufReader::new(std::io::Cursor::new(bytes.clone()));
-                let bytes = Box::new(bytes);
-                self.remote_storage
-                    .upload_storage_object(bytes, size, &path)
-                    .await
-            },
-            |_| false,
-            3,
-            u32::MAX,
-            "Uploading heatmap",
-            backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
-        )
-        .await
-        {
-            if tenant_cancel.is_cancelled() {
-                return Ok(());
-            } else {
-                return Err(e);
-            }
-        }
-
-        // The upload has completed at this point, so report it in the past tense.
-        tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
-
-        Ok(())
-    }
-}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,21 +4,26 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
-mod layer;
 mod layer_desc;
+mod remote_layer;

+use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
+use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
+use anyhow::Result;
 use bytes::Bytes;
+use camino::Utf8PathBuf;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
+use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -34,8 +39,7 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-
-pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
+pub use remote_layer::RemoteLayer;

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -70,7 +74,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from [`Layer::get_value_reconstruct_data`]
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -175,6 +179,26 @@ impl LayerAccessStats {
        new
    }

+    /// Produces a copy of `self` with `new_status` recorded in the copy.
+    ///
+    /// `self` is left as it was; the `new_status` event exists only in the copy.
+    ///
+    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    ///
+    /// [`record_residence_event`]: Self::record_residence_event
+    pub(crate) fn clone_for_residence_change(
+        &self,
+        new_status: LayerResidenceStatus,
+    ) -> LayerAccessStats {
+        // The lock guard is a temporary of this statement, so `self` is already
+        // unlocked again by the time the event is recorded on the copy.
+        let snapshot = self.0.lock().unwrap().clone();
+
+        let copy = LayerAccessStats(Mutex::new(snapshot));
+        copy.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
+        copy
+    }
+
    /// Record a change in layer residency.
    ///
    /// Recording the event must happen while holding the layer map lock to
@@ -297,12 +321,95 @@ impl LayerAccessStats {
    }
 }

+/// Supertrait of the [`PersistentLayer`] trait that captures the bare minimum
+/// interface required by [`LayerMap`](super::layer_map::LayerMap).
+///
+/// All layers should implement a minimal `std::fmt::Debug` without tenant or
+/// timeline names, because those are already known from the context in which
+/// the layers are used (the timeline).
+#[async_trait::async_trait]
+pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
+    /// Return data needed to reconstruct the given page (`key`) at `lsn_range`.
+    ///
+    /// It is up to the caller to collect more data from the previous layer and
+    /// to perform WAL redo, if necessary.
+    ///
+    /// See [`ValueReconstructResult`] for the possible return values.
+    ///
+    /// The collected data is appended to `reconstruct_data`; the caller should
+    /// pass an empty struct on the first call, or a struct with a cached older
+    /// image of the page if one is available. If this returns
+    /// `ValueReconstructResult::Continue`, look up the predecessor layer and call
+    /// again with the same `reconstruct_data` to collect more data.
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult>;
+}
+
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

+/// A [`Layer`] that also exposes its [`PersistentLayerDesc`] (via
+/// [`AsLayerDesc`]), its layer file name and its access statistics.
+///
+/// A layer contains all data in a "rectangle" consisting of a range of keys
+/// and a range of LSNs. In-memory layers are used to ingest incoming WAL and
+/// provide fast access to the recent page versions; on-disk layers are stored
+/// as files on disk and are immutable.
+/// Presumably only the on-disk kind implements this trait (TODO confirm).
+///
+/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
+/// A delta layer contains all modifications within a range of LSNs and keys.
+/// An image layer is a snapshot of all the data in a key-range, at a single
+/// LSN.
+pub trait PersistentLayer: Layer + AsLayerDesc {
+    /// File name used for this layer, both in the pageserver's local filesystem
+    /// state as well as in the remote storage.
+    fn filename(&self) -> LayerFileName {
+        self.layer_desc().filename()
+    }
+
+    /// Path to the layer file in the local filesystem.
+    /// `None` for `RemoteLayer`.
+    fn local_path(&self) -> Option<Utf8PathBuf>;
+
+    /// Permanently remove this layer from disk.
+    fn delete_resident_layer_file(&self) -> Result<()>;
+
+    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        None
+    }
+
+    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
+        None
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        false
+    }
+
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
+
+    fn access_stats(&self) -> &LayerAccessStats;
+}
+
+/// Downcasts `layer` to a [`RemoteLayer`], or yields `None` if it is not remote.
+pub fn downcast_remote_layer(
+    layer: &Arc<dyn PersistentLayer>,
+) -> Option<std::sync::Arc<RemoteLayer>> {
+    if !layer.is_remote_layer() {
+        return None;
+    }
+    Arc::clone(layer).downcast_remote_layer()
+}
+
 pub mod tests {
    use super::*;

@@ -340,6 +447,19 @@ pub mod tests {
    }
 }

+/// Helper enum to hold a PageServerConf, or a path
+///
+/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
+/// global config, and paths to layer files are constructed using the tenant/timeline
+/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
+/// struct for a file on disk, without having a page server running, so that we have no
+/// config. In that case, we use the Path variant to hold the full path to the file on
+/// disk.
+enum PathOrConf {
+    Path(Utf8PathBuf),
+    Conf(&'static PageServerConf),
+}
+
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -34,17 +34,18 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
+use crate::tenant::storage_layer::{
+    PersistentLayer, ValueReconstructResult, ValueReconstructState,
+};
 use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::LayerAccessKind;
+use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::File;
+use std::fs::{self, File};
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
@@ -58,7 +59,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
+use super::{
+    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
+    PersistentLayerDesc,
+};

 ///
 /// Header stored in the beginning of the file
@@ -178,12 +182,20 @@ impl DeltaKey {
    }
 }

-/// This is used only from `pagectl`. Within pageserver, all layers are
-/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
+/// DeltaLayer is the in-memory data structure associated with an on-disk delta
+/// file.
+///
+/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
+/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
+/// Otherwise the struct is just a placeholder for a file that exists on disk,
+/// and it needs to be loaded before using it in queries.
 pub struct DeltaLayer {
-    path: Utf8PathBuf,
+    path_or_conf: PathOrConf,
+
    pub desc: PersistentLayerDesc,
+
    access_stats: LayerAccessStats,
+
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -200,8 +212,6 @@ impl std::fmt::Debug for DeltaLayer {
    }
 }

-/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
-/// file.
 pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -211,6 +221,12 @@ pub struct DeltaLayerInner {
    file: FileBlockReader,
 }

+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
+    }
+}
+
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -220,6 +236,19 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

+#[async_trait::async_trait]
+impl Layer for DeltaLayer {
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
+            .await
+    }
+}
 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
 impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -233,9 +262,40 @@ impl AsLayerDesc for DeltaLayer {
    }
 }

+impl PersistentLayer for DeltaLayer {
+    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
+        Some(self)
+    }
+
+    fn local_path(&self) -> Option<Utf8PathBuf> {
+        self.local_path()
+    }
+
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        self.delete_resident_layer_file()
+    }
+
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.info(reset)
+    }
+
+    fn access_stats(&self) -> &LayerAccessStats {
+        self.access_stats()
+    }
+}
+
 impl DeltaLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        self.desc.dump();
+        println!(
+            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end,
+            self.desc.file_size,
+        );

        if !verbose {
            return Ok(());
@@ -243,7 +303,119 @@ impl DeltaLayer {

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

-        inner.dump(ctx).await
+        println!(
+            "index_start_blk: {}, root {}",
+            inner.index_start_blk, inner.index_root_blk
+        );
+
+        let file = &inner.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            inner.index_start_blk,
+            inner.index_root_blk,
+            file,
+        );
+
+        tree_reader.dump().await?;
+
+        let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
+
+        // A subroutine to dump a single blob
+        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        }
+
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val, ctx).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    let err: anyhow::Error = err;
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.desc.lsn_range.start);
+
+        ensure!(self.desc.key_range.contains(&key));
+
+        let inner = self
+            .load(LayerAccessKind::GetValueReconstructData, ctx)
+            .await?;
+        inner
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
+            .await
+    }
+
+    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
+        Some(self.path())
+    }
+
+    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
+        // delete underlying file
+        fs::remove_file(self.path())?;
+        Ok(())
+    }
+
+    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        let layer_file_name = self.layer_desc().filename().file_name();
+        let lsn_range = self.layer_desc().lsn_range.clone();
+
+        let access_stats = self.access_stats.as_api_model(reset);
+
+        HistoricLayerInfo::Delta {
+            layer_file_name,
+            layer_file_size: self.desc.file_size,
+            lsn_start: lsn_range.start,
+            lsn_end: lsn_range.end,
+            remote: false,
+            access_stats,
+        }
+    }
+
+    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
+        &self.access_stats
+    }
+
+    fn path_for(
+        path_or_conf: &PathOrConf,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        fname: &DeltaFileName,
+    ) -> Utf8PathBuf {
+        match path_or_conf {
+            PathOrConf::Path(path) => path.clone(),
+            PathOrConf::Conf(conf) => conf
+                .timeline_path(tenant_id, timeline_id)
+                .join(fname.to_string()),
+        }
    }

    fn temp_path_for(
@@ -289,26 +461,58 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
+        let summary = match &self.path_or_conf {
+            PathOrConf::Conf(_) => Some(Summary::from(self)),
+            PathOrConf::Path(_) => None,
+        };

-        // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;

-        if actual_filename != expected_filename {
-            println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+        if let PathOrConf::Path(ref path) = self.path_or_conf {
+            // not production code
+
+            let actual_filename = path.file_name().unwrap().to_owned();
+            let expected_filename = self.filename().file_name();
+
+            if actual_filename != expected_filename {
+                println!("warning: filename does not match what is expected from in-file summary");
+                println!("actual: {:?}", actual_filename);
+                println!("expected: {:?}", expected_filename);
+            }
        }

        Ok(Arc::new(loaded))
    }

+    /// Create a DeltaLayer struct representing an existing file on disk.
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        filename: &DeltaFileName,
+        file_size: u64,
+        access_stats: LayerAccessStats,
+    ) -> DeltaLayer {
+        DeltaLayer {
+            path_or_conf: PathOrConf::Conf(conf),
+            desc: PersistentLayerDesc::new_delta(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn_range.clone(),
+                file_size,
+            ),
+            access_stats,
+            inner: OnceCell::new(),
+        }
+    }
+
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<Self> {
-        let mut summary_buf = vec![0; PAGE_SZ];
+        let mut summary_buf = Vec::new();
+        summary_buf.resize(PAGE_SZ, 0);
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;

@@ -317,7 +521,7 @@ impl DeltaLayer {
            .context("get file metadata to determine size")?;

        Ok(DeltaLayer {
-            path: path.to_path_buf(),
+            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            desc: PersistentLayerDesc::new_delta(
                summary.tenant_id,
                summary.timeline_id,
@@ -330,9 +534,29 @@ impl DeltaLayer {
        })
    }

+    fn layer_name(&self) -> DeltaFileName {
+        self.desc.delta_file_name()
+    }
    /// Path to the layer file in pageserver workdir.
-    fn path(&self) -> Utf8PathBuf {
-        self.path.clone()
+    pub fn path(&self) -> Utf8PathBuf {
+        Self::path_for(
+            &self.path_or_conf,
+            &self.desc.tenant_id,
+            &self.desc.timeline_id,
+            &self.layer_name(),
+        )
+    }
+    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
+    ///
+    /// The value can be obtained via the [`ValueRef::load`] function.
+    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .await
+            .context("load delta layer keys")?;
+        DeltaLayerInner::load_keys(inner, ctx)
+            .await
+            .context("Layer index is corrupted")
    }
 }

@@ -437,7 +661,7 @@ impl DeltaLayerWriterInner {
    ///
    /// Finish writing the delta layer.
    ///
-    async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
+    async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -494,21 +718,37 @@ impl DeltaLayerWriterInner {
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-
-        let desc = PersistentLayerDesc::new_delta(
-            self.tenant_id,
-            self.timeline_id,
-            self.key_start..key_end,
-            self.lsn_range.clone(),
-            metadata.len(),
-        );
+        let layer = DeltaLayer {
+            path_or_conf: PathOrConf::Conf(self.conf),
+            desc: PersistentLayerDesc::new_delta(
+                self.tenant_id,
+                self.timeline_id,
+                self.key_start..key_end,
+                self.lsn_range.clone(),
+                metadata.len(),
+            ),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
+            inner: OnceCell::new(),
+        };

        // fsync the file
        file.sync_all().await?;
+        // Rename the file to its final name
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+        let final_path = DeltaLayer::path_for(
+            &PathOrConf::Conf(self.conf),
+            &self.tenant_id,
+            &self.timeline_id,
+            &DeltaFileName {
+                key_range: self.key_start..key_end,
+                lsn_range: self.lsn_range,
+            },
+        );
+        std::fs::rename(self.path, &final_path)?;

-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
-
-        trace!("created delta layer {}", layer.local_path());
+        trace!("created delta layer {final_path}");

        Ok(layer)
    }
@@ -589,12 +829,8 @@ impl DeltaLayerWriter {
    ///
    /// Finish writing the delta layer.
    ///
-    pub(crate) async fn finish(
-        mut self,
-        key_end: Key,
-        timeline: &Arc<Timeline>,
-    ) -> anyhow::Result<ResidentLayer> {
-        self.inner.take().unwrap().finish(key_end, timeline).await
+    pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+        self.inner.take().unwrap().finish(key_end).await
    }
 }

@@ -628,11 +864,11 @@ impl DeltaLayerInner {
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
            if actual_summary != expected_summary {
-                bail!(
-                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                    actual_summary,
-                    expected_summary
-                );
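+                // FIXME(review): this mismatch used to be a hard error (`bail!` with
+                // "in-file summary does not match expected summary", printing
+                // `actual_summary` and `expected_summary`). It is now silently
+                // ignored, so a layer file with a foreign summary loads without any
+                // diagnostic. Restore the `bail!` or at least log before merging.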
            }
        }

@@ -732,17 +968,15 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<'a>(
-        &'a self,
-        ctx: &RequestContext,
+    pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
+        this: &'a T,
+        ctx: &'b RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
-        let file = &self.file;
+        let dl = this.as_ref();
+        let file = &dl.file;

-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
+        let tree_reader =
+            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

@@ -755,7 +989,7 @@ impl DeltaLayerInner {
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(self),
+                            Adapter(dl),
                        )),
                    };
                    let pos = BlobRef(value).pos();
@@ -782,61 +1016,10 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
-
-    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        println!(
-            "index_start_blk: {}, root {}",
-            self.index_start_blk, self.index_root_blk
-        );
-
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
-
-        tree_reader.dump().await?;
-
-        let keys = self.load_keys(ctx).await?;
-
-        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
-        }
-
-        for entry in keys {
-            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val, ctx).await {
-                Ok(desc) => desc,
-                Err(err) => {
-                    format!("ERROR: {err}")
-                }
-            };
-            println!("  key {key} at {lsn}: {desc}");
-        }
-
-        Ok(())
-    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -876,9 +1059,3 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
-
-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -226,14 +226,6 @@ impl LayerFileName {
            _ => false,
        }
    }
-
-    pub(crate) fn kind(&self) -> &'static str {
-        use LayerFileName::*;
-        match self {
-            Delta(_) => "delta",
-            Image(_) => "image",
-        }
-    }
 }

 impl fmt::Display for LayerFileName {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -31,23 +31,21 @@ use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
-    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
+    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
-use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
-use pageserver_api::models::LayerAccessKind;
+use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::File;
+use std::fs::{self, File};
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
-use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -58,7 +56,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
+use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -116,14 +114,22 @@ impl Summary {
    }
 }

-/// This is used only from `pagectl`. Within pageserver, all layers are
-/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
+/// ImageLayer is the in-memory data structure associated with an on-disk image
+/// file.
+///
+/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
+/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
+/// Otherwise the struct is just a placeholder for a file that exists on disk,
+/// and it needs to be loaded before using it in queries.
 pub struct ImageLayer {
-    path: Utf8PathBuf,
+    path_or_conf: PathOrConf,
+
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
+
    access_stats: LayerAccessStats,
+
    inner: OnceCell<ImageLayerInner>,
 }

@@ -140,8 +146,6 @@ impl std::fmt::Debug for ImageLayer {
    }
 }

-/// ImageLayer is the in-memory data structure associated with an on-disk image
-/// file.
 pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -162,11 +166,73 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-impl ImageLayerInner {
-    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let file = &self.file;
+#[async_trait::async_trait]
+impl Layer for ImageLayer {
+    /// Look up given page in the file
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
+            .await
+    }
+}
+
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for ImageLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
+    }
+}
+
+impl AsLayerDesc for ImageLayer {
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
+    }
+}
+
+impl PersistentLayer for ImageLayer {
+    fn local_path(&self) -> Option<Utf8PathBuf> {
+        self.local_path()
+    }
+
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        self.delete_resident_layer_file()
+    }
+
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.info(reset)
+    }
+
+    fn access_stats(&self) -> &LayerAccessStats {
+        self.access_stats()
+    }
+}
+
+impl ImageLayer {
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        println!(
+            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.lsn,
+            self.desc.is_incremental(),
+            self.desc.file_size
+        );
+
+        if !verbose {
+            return Ok(());
+        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+        let file = &inner.file;
        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
+            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

        tree_reader.dump().await?;

@@ -184,36 +250,69 @@ impl ImageLayerInner {

        Ok(())
    }
-}

-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for ImageLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> { // validates the request, loads the inner layer, then delegates the lookup to it
+        assert!(self.desc.key_range.contains(&key)); // panics if `key` is outside this layer's key range
+        assert!(lsn_range.start >= self.lsn); // panics if the requested range starts before this image's LSN
+        assert!(lsn_range.end >= self.lsn); // same check for the end of the range
+
+        let inner = self
+            .load(LayerAccessKind::GetValueReconstructData, ctx) // the load is tagged with this access kind
+            .await?;
+        inner
+            .get_value_reconstruct_data(key, reconstruct_state, ctx) // note: `lsn_range` is not forwarded to the inner call
+            .await
+            // FIXME: makes no sense to dump paths (errors currently get the context "read <layer path>")
+            .with_context(|| format!("read {}", self.path()))
    }
-}

-impl AsLayerDesc for ImageLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
+    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
+        Some(self.path()) // always `Some`: the value is whatever `path()` resolves to
    }
-}
-
-impl ImageLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        self.desc.dump();
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-
-        inner.dump(ctx).await?;

+    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
+        // Delete the underlying layer file at `self.path()`; a removal error is returned to the caller via `?`.
+        fs::remove_file(self.path())?;
        Ok(())
    }

+    /// Describe this image layer as a `HistoricLayerInfo::Image` with `remote: false`.
+    ///
+    /// File name, size and start LSN come from the layer descriptor; `reset` is handed
+    /// to `LayerAccessStats::as_api_model` when exporting the access statistics.
+    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        let desc = self.layer_desc();
+
+        HistoricLayerInfo::Image {
+            layer_file_name: desc.filename().file_name(),
+            layer_file_size: desc.file_size,
+            lsn_start: desc.image_layer_lsn(),
+            remote: false,
+            access_stats: self.access_stats.as_api_model(reset),
+        }
+    }
+
+    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
+        &self.access_stats // borrowed accessor, no copy is made
+    }
+
+    fn path_for(
+        path_or_conf: &PathOrConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        fname: &ImageFileName,
+    ) -> Utf8PathBuf { // resolves the file path for the image layer named `fname`
+        match path_or_conf {
+            PathOrConf::Path(path) => path.to_path_buf(), // explicit path: returned as-is; ids and `fname` are ignored
+            PathOrConf::Conf(conf) => conf // otherwise: `conf.timeline_path(tenant, timeline)` joined with the layer file name
+                .timeline_path(&tenant_id, &timeline_id)
+                .join(fname.to_string()),
+        }
+    }
+
    fn temp_path_for(
        conf: &PageServerConf,
        timeline_id: TimelineId,
@@ -249,33 +348,67 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
+        let expected_summary = match &self.path_or_conf {
+            PathOrConf::Conf(_) => Some(Summary::from(self)),
+            PathOrConf::Path(_) => None,
+        };

-        // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let loaded =
+            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
+                .await?;

-        if actual_filename != expected_filename {
-            println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+        if let PathOrConf::Path(ref path) = self.path_or_conf {
+            // not production code
+            let actual_filename = path.file_name().unwrap().to_owned();
+            let expected_filename = self.filename().file_name();
+
+            if actual_filename != expected_filename {
+                println!("warning: filename does not match what is expected from in-file summary");
+                println!("actual: {:?}", actual_filename);
+                println!("expected: {:?}", expected_filename);
+            }
        }

        Ok(loaded)
    }

+    /// Create an ImageLayer struct representing an existing file on disk; its path is resolved through `conf` (`PathOrConf::Conf`).
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        filename: &ImageFileName,
+        file_size: u64,
+        access_stats: LayerAccessStats,
+    ) -> ImageLayer {
+        ImageLayer {
+            path_or_conf: PathOrConf::Conf(conf),
+            desc: PersistentLayerDesc::new_img(
+                tenant_id, // note: `new_img` takes tenant first, timeline second - the reverse of this function's parameter order
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn,
+                file_size,
+            ), // For now we assume an image layer ALWAYS covers the full range. This may change in the future.
+            lsn: filename.lsn, // same LSN as the one passed to the descriptor above
+            access_stats,
+            inner: OnceCell::new(), // starts empty; presumably filled on first load - TODO confirm
+        }
+    }
+
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
-        let mut summary_buf = vec![0; PAGE_SZ];
+        let mut summary_buf = Vec::new();
+        summary_buf.resize(PAGE_SZ, 0);
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;
        let metadata = file
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
-            path: path.to_path_buf(),
+            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
@@ -289,8 +422,18 @@ impl ImageLayer {
        })
    }

-    fn path(&self) -> Utf8PathBuf {
-        self.path.clone()
+    fn layer_name(&self) -> ImageFileName {
+        self.desc.image_file_name()
+    }
+
+    /// Path to the layer file in pageserver workdir.
+    pub fn path(&self) -> Utf8PathBuf {
+        Self::path_for(
+            &self.path_or_conf,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
+            &self.layer_name(),
+        )
    }
 }

@@ -314,11 +457,11 @@ impl ImageLayerInner {
            expected_summary.index_root_blk = actual_summary.index_root_blk;

            if actual_summary != expected_summary {
-                bail!(
-                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                    actual_summary,
-                    expected_summary
-                );
+                // bail!(
+                //     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                //     actual_summary,
+                //     expected_summary
+                // );
            }
        }

@@ -462,7 +605,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
-    async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
+    async fn finish(self) -> anyhow::Result<ImageLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -516,14 +659,33 @@ impl ImageLayerWriterInner {
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
+        let layer = ImageLayer {
+            path_or_conf: PathOrConf::Conf(self.conf),
+            desc,
+            lsn: self.lsn,
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
+            inner: OnceCell::new(),
+        };

        // fsync the file
        file.sync_all().await?;

-        // FIXME: why not carry the virtualfile here, it supports renaming?
-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        // Rename the file to its final name
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+        let final_path = ImageLayer::path_for(
+            &PathOrConf::Conf(self.conf),
+            self.timeline_id,
+            self.tenant_id,
+            &ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn,
+            },
+        );
+        std::fs::rename(self.path, final_path)?;

-        trace!("created image layer {}", layer.local_path());
+        trace!("created image layer {}", layer.path());

        Ok(layer)
    }
@@ -585,11 +747,8 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
-    pub(crate) async fn finish(
-        mut self,
-        timeline: &Arc<Timeline>,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline).await
+    pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
+        self.inner.take().unwrap().finish().await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,12 +10,11 @@ use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::collections::HashMap;
-use std::sync::{Arc, OnceLock};
+use std::sync::OnceLock;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -29,7 +28,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::RwLock;

-use super::{DeltaLayerWriter, ResidentLayer};
+use super::{DeltaLayer, DeltaLayerWriter, Layer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -208,6 +207,20 @@ impl InMemoryLayer {
    }
 }

+#[async_trait::async_trait]
+impl Layer for InMemoryLayer { // trait adapter: forwards to a same-named method on `self`
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
+        self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx) // NOTE(review): relies on an inherent `InMemoryLayer` method of this name taking precedence over the trait method; otherwise this would recurse - confirm
+            .await
+    }
+}
+
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
@@ -216,13 +229,17 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
+    ///
    /// Get layer size.
+    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
        Ok(inner.file.len())
    }

+    ///
    /// Create a new, empty, in-memory layer
+    ///
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -314,11 +331,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(
-        &self,
-        timeline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> Result<ResidentLayer> {
+    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -363,8 +376,7 @@ impl InMemoryLayer {
            }
        }

-        // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,3 +1,4 @@
+use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
@@ -5,7 +6,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::repository::Key;
+use crate::{context::RequestContext, repository::Key};

 use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -99,22 +100,6 @@ impl PersistentLayerDesc {
        }
    }

-    pub fn from_filename(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        filename: LayerFileName,
-        file_size: u64,
-    ) -> Self {
-        match filename {
-            LayerFileName::Image(i) => {
-                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
-            }
-            LayerFileName::Delta(d) => {
-                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
-            }
-        }
-    }
-
    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -188,31 +173,21 @@ impl PersistentLayerDesc {
        self.is_delta
    }

-    pub fn dump(&self) {
-        if self.is_delta {
-            println!(
-                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
-                self.tenant_id,
-                self.timeline_id,
-                self.key_range.start,
-                self.key_range.end,
-                self.lsn_range.start,
-                self.lsn_range.end,
-                self.is_incremental(),
-                self.file_size,
-            );
-        } else {
-            println!(
-                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-                self.tenant_id,
-                self.timeline_id,
-                self.key_range.start,
-                self.key_range.end,
-                self.image_layer_lsn(),
-                self.is_incremental(),
-                self.file_size
-            );
-        }
+    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { // prints a one-line summary to stdout; `_verbose` and `_ctx` are unused
+        println!(
+            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end,
+            self.is_delta, // delta and image layers share this single format; this flag tells them apart
+            self.is_incremental(),
+            self.file_size,
+        );
+
+        Ok(()) // nothing in this body can fail
    }

    pub fn file_size(&self) -> u64 {
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -0,0 +1,216 @@
+//! A RemoteLayer is an in-memory placeholder for a layer file that exists
+//! in remote storage.
+//!
+use crate::config::PageServerConf;
+use crate::context::RequestContext;
+use crate::repository::Key;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::timeline::layer_manager::LayerManager;
+use anyhow::{bail, Result};
+use camino::Utf8PathBuf;
+use pageserver_api::models::HistoricLayerInfo;
+use std::ops::Range;
+use std::sync::Arc;
+
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::filename::{DeltaFileName, ImageFileName};
+use super::{
+    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
+};
+
+/// RemoteLayer is a not-yet-downloaded [`ImageLayer`] or
+/// [`DeltaLayer`](super::DeltaLayer).
+///
+/// A RemoteLayer might be downloaded on-demand during operations which are
+/// allowed to download remote layers; when that happens, it gets replaced with a
+/// concrete `DeltaLayer` or `ImageLayer`.
+///
+/// See [`crate::context::RequestContext`] for authorization to download.
+pub struct RemoteLayer {
+    pub desc: PersistentLayerDesc, // built by `new_img` / `new_delta` from the layer file name and metadata size
+
+    pub layer_metadata: LayerFileMetadata, // its `file_size()` is the size reported by `info()`
+
+    access_stats: LayerAccessStats,
+
+    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>, // the constructors create this with a single permit
+
+    /// Has `LayerMap::replace` failed for this (true) or not (false).
+    ///
+    /// Used together with the [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
+    /// The field is used to mark a RemoteLayer as permanently (until restart or ignore+load)
+    /// unprocessable, because a `LayerMap::replace` failed.
+    ///
+    /// It is very unlikely for these to accumulate in the Timeline's LayerMap, but having this avoids
+    /// a possible fast loop between `Timeline::get_reconstruct_data` and
+    /// `Timeline::download_remote_layer`, which also logs.
+    ///
+    /// [`ongoing_download`]: Self::ongoing_download
+    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
+}
+
+impl std::fmt::Debug for RemoteLayer { // hand-written Debug: prints file name, metadata and is_incremental only
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RemoteLayer")
+            .field("file_name", &self.desc.filename())
+            .field("layer_metadata", &self.layer_metadata)
+            .field("is_incremental", &self.desc.is_incremental())
+            .finish() // access stats and the download-state fields are not included
+    }
+}
+
+/// A remote layer holds no data locally, so every read attempt is answered with an
+/// error naming the layer that has to be downloaded first.
+#[async_trait::async_trait]
+impl Layer for RemoteLayer {
+    async fn get_value_reconstruct_data(
+        &self,
+        _key: Key,
+        _lsn_range: Range<Lsn>,
+        _reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
+        // All arguments are ignored: there is nothing to read until the download happens.
+        bail!("layer {self} needs to be downloaded")
+    }
+}
+
+/// Boilerplate `Display` impl: prints the `short_id()` of the layer descriptor (always use `layer_desc` for persistent layers).
+impl std::fmt::Display for RemoteLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
+    }
+}
+
+impl AsLayerDesc for RemoteLayer { // exposes the descriptor stored in `self.desc`
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc // borrowed, not cloned
+    }
+}
+
+/// `PersistentLayer` view of a layer that has no local file: path lookup yields
+/// `None`, file deletion fails, and `info` always reports `remote: true`.
+impl PersistentLayer for RemoteLayer {
+    fn local_path(&self) -> Option<Utf8PathBuf> {
+        None
+    }
+
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        bail!("remote layer has no layer file");
+    }
+
+    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<Arc<RemoteLayer>> {
+        Some(self)
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        true
+    }
+
+    /// Build the API description of this layer.
+    ///
+    /// The variant follows `desc.is_delta`; the reported size comes from
+    /// `layer_metadata`, and `reset` is handed to `LayerAccessStats::as_api_model`.
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        let desc = self.layer_desc();
+
+        // Values shared by both variants are computed once up front.
+        let layer_file_name = desc.filename().file_name();
+        let Range {
+            start: lsn_start,
+            end: lsn_end,
+        } = desc.lsn_range.clone();
+        let layer_file_size = self.layer_metadata.file_size();
+        let access_stats = self.access_stats.as_api_model(reset);
+
+        if desc.is_delta {
+            HistoricLayerInfo::Delta {
+                layer_file_name,
+                layer_file_size,
+                lsn_start,
+                lsn_end,
+                remote: true,
+                access_stats,
+            }
+        } else {
+            // Image layers report only the start of the LSN range.
+            HistoricLayerInfo::Image {
+                layer_file_name,
+                layer_file_size,
+                lsn_start,
+                remote: true,
+                access_stats,
+            }
+        }
+    }
+
+    fn access_stats(&self) -> &LayerAccessStats {
+        &self.access_stats
+    }
+}
+
+impl RemoteLayer {
+    /// Placeholder for an image layer that exists only in remote storage.
+    ///
+    /// The descriptor takes its key range and LSN from `fname` and its size from
+    /// `layer_metadata`.
+    pub fn new_img(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &ImageFileName,
+        layer_metadata: &LayerFileMetadata,
+        access_stats: LayerAccessStats,
+    ) -> RemoteLayer {
+        let desc = PersistentLayerDesc::new_img(
+            tenantid,
+            timelineid,
+            fname.key_range.clone(),
+            fname.lsn,
+            layer_metadata.file_size(),
+        );
+        Self::with_desc(desc, layer_metadata, access_stats)
+    }
+
+    /// Placeholder for a delta layer that exists only in remote storage.
+    ///
+    /// The descriptor takes its key range and LSN range from `fname` and its size
+    /// from `layer_metadata`.
+    pub fn new_delta(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &DeltaFileName,
+        layer_metadata: &LayerFileMetadata,
+        access_stats: LayerAccessStats,
+    ) -> RemoteLayer {
+        let desc = PersistentLayerDesc::new_delta(
+            tenantid,
+            timelineid,
+            fname.key_range.clone(),
+            fname.lsn_range.clone(),
+            layer_metadata.file_size(),
+        );
+        Self::with_desc(desc, layer_metadata, access_stats)
+    }
+
+    /// Common tail of both constructors: a single-permit download semaphore and a
+    /// default-initialized replacement-failure flag.
+    fn with_desc(
+        desc: PersistentLayerDesc,
+        layer_metadata: &LayerFileMetadata,
+        access_stats: LayerAccessStats,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            desc,
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
+            access_stats,
+        }
+    }
+
+    /// Create the resident counterpart of this layer, after it has been downloaded.
+    ///
+    /// Returns a `DeltaLayer` or an `ImageLayer` depending on `desc.is_delta`. It carries
+    /// this layer's ids and file name, the given `file_size`, and access stats cloned
+    /// with a `Resident` residence change. `_layer_map_lock_held_witness` is not used
+    /// in the body.
+    pub(crate) fn create_downloaded_layer(
+        &self,
+        _layer_map_lock_held_witness: &LayerManager,
+        conf: &'static PageServerConf,
+        file_size: u64,
+    ) -> Arc<dyn PersistentLayer> {
+        let timeline_id = self.desc.timeline_id;
+        let tenant_id = self.desc.tenant_id;
+        let resident_stats = self
+            .access_stats
+            .clone_for_residence_change(LayerResidenceStatus::Resident);
+
+        if self.desc.is_delta {
+            Arc::new(DeltaLayer::new(
+                conf,
+                timeline_id,
+                tenant_id,
+                &self.desc.delta_file_name(),
+                file_size,
+                resident_stats,
+            ))
+        } else {
+            Arc::new(ImageLayer::new(
+                conf,
+                timeline_id,
+                tenant_id,
+                &self.desc.image_file_name(),
+                file_size,
+                resident_stats,
+            ))
+        }
+    }
+}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -159,8 +159,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
            // loop, #3501. There are also additional "allowed_errors" in tests.
-            if first && random_init_delay(period, &cancel).await.is_err() {
-                break;
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
+                }
            }

            let started_at = Instant::now();
@@ -180,16 +183,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            if !first {
-                // The first iteration is typically much slower, because all tenants compete for the
-                // compaction sempahore to run, and because of concurrent startup work like initializing
-                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
-                warn_when_period_overrun(
-                    started_at.elapsed(),
-                    period,
-                    BackgroundLoopKind::Compaction,
-                );
-            }
+            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -198,8 +192,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
-
-            first = false;
        }
    }
    .await;
@@ -231,8 +223,11 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            let period = tenant.get_gc_period();

-            if first && random_init_delay(period, &cancel).await.is_err() {
-                break;
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
+                }
            }

            let started_at = Instant::now();
@@ -256,12 +251,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            if !first {
-                // The first iteration is typically much slower, because all tenants compete for the
-                // compaction sempahore to run, and because of concurrent startup work like initializing
-                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
-                warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
-            }
+            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -270,8 +260,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
-
-            first = false;
        }
    }
    .await;
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -29,6 +29,7 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
+        storage_layer::PersistentLayer,
        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
@@ -209,26 +210,15 @@ impl Timeline {
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
-        let candidates: Vec<_> = {
+        let candidates: Vec<Arc<dyn PersistentLayer>> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);
-
-                // guard against eviction while we inspect it; it might be that eviction_task and
-                // disk_usage_eviction_task both select the same layers to be evicted, and
-                // seemingly free up double the space. both succeeding is of no consequence.
-                let guard = match hist_layer.keep_resident().await {
-                    Ok(Some(l)) => l,
-                    Ok(None) => continue,
-                    Err(e) => {
-                        // these should not happen, but we cannot make them statically impossible right
-                        // now.
-                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
-                        continue;
-                    }
-                };
+                if hist_layer.is_remote_layer() {
+                    continue;
+                }

                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
@@ -259,7 +249,7 @@ impl Timeline {
                    }
                };
                if no_activity_for > p.threshold {
-                    candidates.push(guard.drop_eviction_guard())
+                    candidates.push(hist_layer)
                }
            }
            candidates
@@ -278,7 +268,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates, cancel)
+            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
            .await
        {
            Err(pre_err) => {
@@ -289,7 +279,7 @@ impl Timeline {
            Ok(results) => results,
        };
        assert_eq!(results.len(), candidates.len());
-        for result in results {
+        for (l, result) in candidates.iter().zip(results) {
            match result {
                None => {
                    stats.skipped_for_shutdown += 1;
@@ -297,10 +287,20 @@ impl Timeline {
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
+                    stats.not_evictable += 1;
+                }
+                Some(Err(EvictionError::FileNotFound)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
+                Some(Err(
+                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
+                )) => {
+                    let e = utils::error::report_compact_sources(&e);
+                    warn!(layer = %l, "failed to evict layer: {e}");
+                    stats.not_evictable += 1;
+                }
            }
        }
        if stats.candidates == stats.not_evictable {
@@ -344,7 +344,20 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
+        //
+        // It is critical we are responsive to cancellation here. Otherwise, we deadlock with
+        // tenant deletion (holds TENANTS in read mode) and any other task that attempts to
+        // acquire TENANTS in write mode before we call get_tenant here.
+        // See https://github.com/neondatabase/neon/issues/5284.
+        let res = tokio::select! {
+            _ = cancel.cancelled() => {
+                return ControlFlow::Break(());
+            }
+            res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
+                res
+            }
+        };
+        let tenant = match res {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Christian Schwarz	4b7fddeabe	backup download_all_layers.py script	2023-10-26 08:14:08 +00:00
Christian Schwarz	68386c19a2	backing up the exact tokio-epoll-uring version used in the earlier (since reverted) integration commit commit dde7c280e77dbb867d2fd459d629da2fd7b0edc6 (HEAD -> problame/wip-2023-10-17, origin/problame/wip-2023-10-17) Author: Christian Schwarz <me@cschwarz.com> Date: Tue Oct 17 10:09:48 2023 +0000 no info! logging (not sure this matters, tracing showed up in perf when integrating this branch into neon.git) The integration commit in this branch was: commit `61fac1ab0b` Author: Christian Schwarz <me@cschwarz.com> Date: Tue Aug 29 19:13:38 2023 +0000 CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking	2023-10-17 10:12:22 +00:00
Christian Schwarz	db787dd6e0	backing up pageserver.toml used in experiments remote_storage ={local_path='/home/admin/neon-main/test_output/test_pageserver_startup_many_tenants/repo/local_fs_remote_storage/pageserver'} id =1 pg_distrib_dir ='/home/admin/neon-main/pg_install' http_auth_type ='Trust' pg_auth_type ='Trust' listen_http_addr ='localhost:15003' listen_pg_addr ='localhost:15002' broker_endpoint ='http://127.0.0.1:15001/' # 2023-10-10 17:46 UTC #page_cache_size = 16384 #max_file_descriptors = 2000 # 2023-10-10 18:10 UTC page_cache_size = 819200 max_file_descriptors = 100000 # Initial configuration file created by 'pageserver --init' #listen_pg_addr = '127.0.0.1:64000' #listen_http_addr = '127.0.0.1:9898' #wait_lsn_timeout = '60 s' #wal_redo_timeout = '60 s' #max_file_descriptors = 100 # initial superuser role name to use when creating a new tenant #initial_superuser_name = 'cloud_admin' #broker_endpoint = 'http://127.0.0.1:50051' #log_format = 'plain' #concurrent_tenant_size_logical_size_queries = '1' #metric_collection_interval = '10 min' #cached_metric_collection_interval = '0s' #synthetic_size_calculation_interval = '10 min' #disk_usage_based_eviction = { max_usage_pct = .., min_avail_bytes = .., period = "10s"} #background_task_maximum_delay = '10s' metric_collection_endpoint = "https://localtest.me:23423" metric_collection_interval = "10min" cached_metric_collection_interval = "0s" [tenant_config] eviction_policy = { kind = "LayerAccessThreshold" , period = "10m", threshold = "7d" }	2023-10-10 18:16:58 +00:00
Christian Schwarz	6c5e8c6bb6	backing up useful prometheus queries: http://localhost:9090/graph?g0.expr=rate(pageserver_getpage_reconstruct_seconds_count%5B20s%5D)&g0.tab=0&g0.stacked=0&g0.show_exemplars=0&g0.range_input=1h&g1.expr=sum(rate(pageserver_page_cache_find_victim_iters_total%5B20s%5D))%0A%2F%0Asum(rate(pageserver_getpage_reconstruct_seconds_count%5B20s%5D))&g1.tab=0&g1.stacked=0&g1.show_exemplars=0&g1.range_input=6h&g2.expr=pageserver_storage_operations_seconds_global_count%7Boperation!%3D%22load%20layer%20map%22%7D&g2.tab=0&g2.stacked=0&g2.show_exemplars=0&g2.range_input=30m&g3.expr=sum(rate(pageserver_background_loop_period_overrun_count%5B20s%5D))%20by%20(task%2Cperiod)&g3.tab=0&g3.stacked=0&g3.show_exemplars=0&g3.range_input=1h&g4.expr=pageserver_background_loop_semaphore_wait_start_count%0A-%0Apageserver_background_loop_semaphore_wait_finish_count&g4.tab=0&g4.stacked=0&g4.show_exemplars=0&g4.range_input=1h	2023-10-10 17:55:54 +00:00
Christian Schwarz	c5259dcf32	WIP++ v2 limit eviction task concurrency: metric & enum	2023-10-10 17:52:32 +00:00
Christian Schwarz	112008519c	HACK: BACKGROUND_RUNTIME webserver to measure response time using `wrk`	2023-10-10 13:37:16 +00:00
Christian Schwarz	5917a54719	Revert "WIP: tracing-flame support" This reverts commit `dbe3290f89`.	2023-10-10 13:35:55 +00:00
Christian Schwarz	dbe3290f89	WIP: tracing-flame support	2023-10-10 12:17:55 +00:00
Christian Schwarz	bfcde8f9e6	WIP v2 limit eviction task concurrency This reverts commit `55106aa981`.	2023-10-10 12:17:55 +00:00
Christian Schwarz	dbb8377983	Revert "CP tokio_epoll_uring for read path" This reverts commit `1556234d9a`.	2023-10-10 12:17:55 +00:00
Christian Schwarz	d91539b888	Revert "CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking" This reverts commit `61fac1ab0b`.	2023-10-10 12:17:41 +00:00
Christian Schwarz	61fac1ab0b	CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking This makes Delta/Image ::load fns fully tokio-epoll-uring	2023-10-10 11:56:31 +00:00
Christian Schwarz	8d3e8078f7	comment out any spans in page cache	2023-10-10 11:56:31 +00:00
Christian Schwarz	373fa7c2ac	origin/problame/page-cache-forward-progress/3: trace spans and events only for tests	2023-10-09 20:21:22 +00:00
Christian Schwarz	1556234d9a	CP tokio_epoll_uring for read path	2023-10-09 20:20:59 +00:00
Christian Schwarz	55106aa981	Revert "WIP limit eviction task concurrency" This reverts commit `64680b1373`.	2023-10-09 19:47:17 +00:00
Christian Schwarz	64680b1373	WIP limit eviction task concurrency	2023-10-09 19:47:04 +00:00
Christian Schwarz	b86cd24a23	disable concurrent compaction limit (it wasn't there when I first analyzed the issue)	2023-10-09 19:29:47 +00:00
Christian Schwarz	d85baac608	REPRO: rebase fallout & add some instructions	2023-10-09 19:10:28 +00:00
Christian Schwarz	f06f274b38	REPRO the problem: , uses 430GB of space; 4 seconds load time; constant 20kIOPS after ~20s	2023-10-09 19:10:22 +00:00
Christian Schwarz	d98575f5a6	Revert "revert recent VirtualFile asyncification changes (#5291 )" This reverts commit `ab1f37e908`.	2023-10-09 19:02:59 +00:00
Christian Schwarz	33d0072342	move into library	2023-10-09 21:02:27 +02:00
Christian Schwarz	174bceccb1	commented out the check for just-once-polled, works now, don't understand why though	2023-10-09 19:26:47 +02:00
Christian Schwarz	f5bbba5014	fixes	2023-10-09 17:54:44 +02:00
Christian Schwarz	868cf8aeb5	hand-roll it instead	2023-10-06 18:45:41 +02:00
Christian Schwarz	9f03dd24c2	page_cache: find_victim: prevent starvation	2023-10-05 16:54:02 +02:00
Christian Schwarz	dc96a7604a	page_cache: ensure forward progress on cache miss	2023-10-05 16:51:08 +02:00
Christian Schwarz	d7c94e67ce	inline lock_for_write and try_lock_for_write into memorize_materialized_page Motivation ========== It's the only user, and the name of `_for_write` is wrong as of commit `7a63685cde` Author: Christian Schwarz <christian@neon.tech> Date: Fri Aug 18 19:31:03 2023 +0200 simplify page-caching of EphemeralFile (#4994) Notes ===== This also allows us to get rid of the WriteBufResult type. Also rename `search_mapping_for_write` to `search_mapping_exact`. It makes more sense that way because there is `_for_write`-locking anymore.	2023-10-05 16:01:29 +02:00