wip

Merge branch 'fast-sort' of github.com:neondatabase/neon into fast-sort
Fix field1 input in test data
2026-01-26 06:40:38 +00:00 · 2023-11-03 16:41:57 -04:00 · 2023-11-03 14:44:58 -04:00 · 2023-11-03 14:44:40 -04:00 · 2023-11-02 09:58:06 -04:00 · 2023-11-02 09:57:23 -04:00
211 changed files with 16413 additions and 10202 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -5,4 +5,6 @@ self-hosted-runner:
    - small
    - us-east-2
 config-variables:
+  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -203,6 +203,10 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
+        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
+          exit 0
+        fi
+
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

        ./scripts/pysync
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -320,6 +320,9 @@ jobs:
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
      - name: Run cargo build
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
@@ -335,6 +338,16 @@ jobs:
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -420,7 +433,7 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
@@ -455,7 +468,7 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -834,7 +847,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.12
+      VM_BUILDER_VERSION: v0.18.5

    steps:
      - name: Checkout
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -32,7 +32,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -90,18 +90,21 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: make postgres-v14 -j$(nproc)
+        run: make postgres-v14 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: make postgres-v15 -j$(nproc)
+        run: make postgres-v15 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v16
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: make postgres-v16 -j$(nproc)
+        run: make postgres-v16 -j$(sysctl -n hw.ncpu)

      - name: Build neon extensions
-        run: make neon-pg-ext -j$(nproc)
+        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
+
+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(sysctl -n hw.ncpu)

      - name: Run cargo build
        run: cargo build --all --release
@@ -126,7 +129,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -135,6 +138,9 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(nproc)
+
      - name: Produce the build stats
        run: cargo build --all --release --timings

--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 2'
+    - cron: '0 7 * * 5'
  workflow_dispatch:

 jobs:
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,7 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
+    "libs/walproposer",
 ]

 [workspace.package]
@@ -36,6 +37,10 @@ license = "Apache-2.0"
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+azure_core = "0.16"
+azure_identity = "0.16"
+azure_storage = "0.16"
+azure_storage_blobs = "0.16"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -76,6 +81,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
+http-types = "2"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
@@ -155,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -180,6 +186,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
+walproposer = { version = "0.1", path = "./libs/walproposer/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
@@ -195,7 +202,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ################# Binary contents sections

--- a/38
+++ b/38
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers
+neon: postgres-headers walproposer-lib
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

@@ -168,6 +168,42 @@ neon-pg-ext-clean-%:
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean

+# Build walproposer as a static library. The walproposer source code is
+# located in the pgxn/neon directory.
+#
+# We also need to include libpgport.a and libpgcommon.a, because walproposer
+# uses some functions from those libraries.
+#
+# On Linux, some object files are removed from the copied libpgport.a and
+# libpgcommon.a because they depend on openssl and other libraries that are
+# not included in our Rust build.
+.PHONY: walproposer-lib
+walproposer-lib: neon-pg-ext-v16
+	+@echo "Compiling walproposer-lib"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+ifeq ($(UNAME_S),Linux)
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
+		pg_strong_random.o
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+		pg_crc32c.o \
+		hmac_openssl.o \
+		cryptohash_openssl.o \
+		scram-common.o \
+		md5_common.o \
+		checksum_helper.o
+endif
+
+.PHONY: walproposer-lib-clean
+walproposer-lib-clean:
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+
 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
--- a/4
+++ b/4
@@ -1,5 +1,5 @@
 Neon
 Copyright 2022 Neon Inc.

-The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
-PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.
+The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
+See vendor/postgres-vX/COPYRIGHT for details.
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -156,6 +156,7 @@ fn main() -> Result<()> {
                let path = Path::new(sp);
                let file = File::open(path)?;
                spec = Some(serde_json::from_reader(file)?);
+                live_config_allowed = true;
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
@@ -277,8 +278,9 @@ fn main() -> Result<()> {
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            use tracing::warn;
-            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let vm_monitor_addr = matches
+                .get_one::<String>("vm-monitor-addr")
+                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
@@ -287,22 +289,16 @@ fn main() -> Result<()> {
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
-            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
-                (None, None) => None,
-                (None, Some(_)) => {
-                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
-                    None
-                }
-                (Some(_), None) => {
-                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
-                }
-                (Some(_), Some(_)) => Some(
+            let rt = if env::var_os("AUTOSCALING").is_some() {
+                Some(
                    tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(4)
                        .enable_all()
                        .build()
-                        .expect("failed to create tokio runtime for monitor"),
-                ),
+                        .expect("failed to create tokio runtime for monitor")
+                )
+            } else {
+                None
            };

            // This token is used internally by the monitor to clean up all threads
@@ -313,7 +309,7 @@ fn main() -> Result<()> {
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
-                        addr: vm_monitor_addr.cloned().unwrap(),
+                        addr: vm_monitor_addr.clone(),
                        file_cache_on_disk,
                    })),
                    token.clone(),
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -692,10 +692,11 @@ impl ComputeNode {
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        create_neon_superuser(spec, &mut client)?;
+        cleanup_instance(&mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str())?;
+        handle_grants(spec, &mut client, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        create_availability_check_data(&mut client)?;

@@ -731,10 +732,11 @@ impl ComputeNode {
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
            client.simple_query("SET neon.forward_ddl = false")?;
+            cleanup_instance(&mut client)?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str())?;
+            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
        }

--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::fmt::Write;
 use std::fs;
 use std::fs::File;
@@ -192,11 +193,16 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
+        .query(
+            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
+            &[],
+        )?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
+            replication: Some(row.get("rolreplication")),
+            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
@@ -205,22 +211,37 @@ pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
 }

 /// Build a list of existing Postgres databases
-pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
-    let postgres_dbs = client
+pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
+    // `pg_database.datconnlimit = -2` means that the database is in an
+    // invalid state. See:
+    //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+    let postgres_dbs: Vec<Database> = client
        .query(
-            "SELECT datname, datdba::regrole::text as owner
-               FROM pg_catalog.pg_database;",
+            "SELECT
+                datname AS name,
+                datdba::regrole::text AS owner,
+                NOT datallowconn AS restrict_conn,
+                datconnlimit = - 2 AS invalid
+            FROM
+                pg_catalog.pg_database;",
            &[],
        )?
        .iter()
        .map(|row| Database {
-            name: row.get("datname"),
+            name: row.get("name"),
            owner: row.get("owner"),
+            restrict_conn: row.get("restrict_conn"),
+            invalid: row.get("invalid"),
            options: None,
        })
        .collect();

-    Ok(postgres_dbs)
+    let dbs_map = postgres_dbs
+        .iter()
+        .map(|db| (db.name.clone(), db.clone()))
+        .collect::<HashMap<_, _>>();
+
+    Ok(dbs_map)
 }

 /// Wait for Postgres to become ready to accept connections. It's ready to
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -13,7 +13,7 @@ use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

 use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
-use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
+use compute_api::spec::{ComputeSpec, PgIdent, Role};

 // Do control plane request and return response if any. In case of error it
 // returns a bool flag indicating whether it makes sense to retry the request
@@ -24,7 +24,7 @@ fn do_control_plane_request(
 ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
-        .header("Authorization", jwt)
+        .header("Authorization", format!("Bearer {}", jwt))
        .send()
        .map_err(|e| {
            (
@@ -161,6 +161,38 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

+/// Compute could be shut down unexpectedly, for example while a database
+/// is being dropped. That leaves the database in an invalid state, which
+/// prevents creating a new database with the same name. This function
+/// drops such databases before we proceed with catalog updates. Any
+/// future instance-level cleanup operations can go here too.
+#[instrument(skip_all)]
+pub fn cleanup_instance(client: &mut Client) -> Result<()> {
+    let existing_dbs = get_existing_dbs(client)?;
+
+    for (_, db) in existing_dbs {
+        if db.invalid {
+            // Since a recent Postgres commit, an interrupted DROP DATABASE
+            // leaves the database in an invalid state. According to the
+            // commit message, the only option for the user is to drop it again.
+            // See:
+            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+            //
+            // The Neon Postgres extension de-registers a db in the control
+            // plane metadata only after it has been dropped, so the control
+            // plane may still think that the db should exist. In that case
+            // the db will be re-created by `handle_databases()`. That is fine,
+            // as the user can simply repeat the drop (in vanilla Postgres they
+            // would need to do the same).
+            let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
+            info!("dropping invalid database {}", db.name);
+            client.execute(query.as_str(), &[])?;
+        }
+    }
+
+    Ok(())
+}
+
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
@@ -233,6 +265,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
+                || !r.bypassrls.unwrap_or(false)
+                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -264,13 +298,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
+                let mut query: String =
+                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
@@ -379,13 +414,13 @@ fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent
 /// which together provide us idempotency.
 #[instrument(skip_all)]
 pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+    let existing_dbs = get_existing_dbs(client)?;

    // Print a list of existing Postgres databases (only in debug mode)
    if span_enabled!(Level::INFO) {
        info!("postgres databases:");
-        for r in &existing_dbs {
-            info!("    {}:{}", r.name, r.owner);
+        for (dbname, db) in &existing_dbs {
+            info!("    {}:{}", dbname, db.owner);
        }
    }

@@ -439,8 +474,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();

-                    // XXX: with a limited number of roles it is fine, but consider making it a HashMap
-                    if existing_dbs.iter().any(|r| r.name == op.name) {
+                    if existing_dbs.get(&op.name).is_some() {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
@@ -457,14 +491,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    }

    // Refresh Postgres databases info to handle possible renames
-    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+    let existing_dbs = get_existing_dbs(client)?;

    info!("cluster spec databases:");
    for db in &spec.cluster.databases {
        let name = &db.name;
-
-        // XXX: with a limited number of databases it is fine, but consider making it a HashMap
-        let pg_db = existing_dbs.iter().find(|r| r.name == *name);
+        let pg_db = existing_dbs.get(name);

        enum DatabaseAction {
            None,
@@ -530,13 +562,32 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
-    info!("cluster spec grants:");
+pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
+    info!("modifying database permissions");
+    let existing_dbs = get_existing_dbs(client)?;

    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
    for db in &spec.cluster.databases {
+        match existing_dbs.get(&db.name) {
+            Some(pg_db) => {
+                if pg_db.restrict_conn || pg_db.invalid {
+                    info!(
+                        "skipping grants for db {} (invalid: {}, connections not allowed: {})",
+                        db.name, pg_db.invalid, pg_db.restrict_conn
+                    );
+                    continue;
+                }
+            }
+            None => {
+                bail!(
+                    "database {} doesn't exist in Postgres after handle_databases()",
+                    db.name
+                );
+            }
+        }
+
        let mut conf = Config::from_str(connstr)?;
        conf.dbname(&db.name);

@@ -575,6 +626,11 @@ pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {

        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
        // This is needed because since postgres 15 this privilege is removed by default.
+        // TODO: the web_access user hasn't been created for almost a year. We may
+        // still have active users of year-old projects, but hopefully not, so check
+        // and remove this code if possible. The worst that could happen is that a
+        // user won't be able to use the public schema in NEW databases created in
+        // a very OLD project.
        let grant_query = "DO $$\n\
                BEGIN\n\
                    IF EXISTS(\n\
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -28,7 +28,7 @@ mod pg_helpers_tests {
        assert_eq!(
            spec.cluster.settings.as_pg_settings(),
            r#"fsync = off
-wal_level = replica
+wal_level = logical
 hot_standby = on
 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
 wal_log_hints = on
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -19,7 +19,7 @@ const COMMAND: &str = "attachment_service";
 pub struct AttachHookRequest {
    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    pub pageserver_id: Option<NodeId>,
+    pub node_id: Option<NodeId>,
 }

 #[derive(Serialize, Deserialize)]
@@ -85,7 +85,7 @@ impl AttachmentService {
            .control_plane_api
            .clone()
            .unwrap()
-            .join("attach_hook")
+            .join("attach-hook")
            .unwrap();
        let client = reqwest::blocking::ClientBuilder::new()
            .build()
@@ -93,7 +93,7 @@ impl AttachmentService {

        let request = AttachHookRequest {
            tenant_id,
-            pageserver_id: Some(pageserver_id),
+            node_id: Some(pageserver_id),
        };

        let response = client.post(url).json(&request).send()?;
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -86,7 +86,7 @@ where
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
        .args(args);
-    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
+    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

    let pid_file_to_check = match initial_pid_file {
@@ -238,11 +238,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    filled_cmd
 }

-fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
+        "AZURE_STORAGE_ACCOUNT",
+        "AZURE_STORAGE_ACCESS_KEY",
    ] {
        if let Ok(value) = std::env::var(env_key) {
            cmd = cmd.env(env_key, value);
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -12,7 +12,9 @@ use hyper::{Body, Request, Response};
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
+use utils::http::endpoint::request_span;
 use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};

 use utils::{
    http::{
@@ -170,7 +172,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
                id: *t,
-                generation: state.generation,
+                gen: state.generation,
            });
        }
    }
@@ -216,14 +218,31 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
        .tenants
        .entry(attach_req.tenant_id)
        .or_insert_with(|| TenantState {
-            pageserver: attach_req.pageserver_id,
+            pageserver: attach_req.node_id,
            generation: 0,
        });

-    if attach_req.pageserver_id.is_some() {
+    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
        tenant_state.generation += 1;
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            ps_id = %attaching_pageserver,
+            generation = %tenant_state.generation,
+            "issuing",
+        );
+    } else if let Some(ps_id) = tenant_state.pageserver {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            %ps_id,
+            generation = %tenant_state.generation,
+            "dropping",
+        );
+    } else {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            "no-op: tenant already has no pageserver");
    }
-    tenant_state.pageserver = attach_req.pageserver_id;
+    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

    locked.save().await.map_err(ApiError::InternalServerError)?;
@@ -231,7 +250,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    json_response(
        StatusCode::OK,
        AttachHookResponse {
-            gen: attach_req.pageserver_id.map(|_| generation),
+            gen: attach_req.node_id.map(|_| generation),
        },
    )
 }
@@ -239,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
-        .post("/re-attach", handle_re_attach)
-        .post("/validate", handle_validate)
-        .post("/attach_hook", handle_attach_hook)
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
+        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
 }

 #[tokio::main]
@@ -268,7 +287,16 @@ async fn main() -> anyhow::Result<()> {
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);
-    server.await?;
+
+    tokio::task::spawn(server);
+
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
+        }
+    })?;

    Ok(())
 }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -798,6 +798,24 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
+        "reconfigure" => {
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    Some(NodeId(
+                        id_str.parse().context("while parsing pageserver id")?,
+                    ))
+                } else {
+                    None
+                };
+            endpoint.reconfigure(pageserver_id)?;
+        }
        "stop" => {
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
@@ -1369,6 +1387,12 @@ fn cli() -> Command {
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                )
+                .subcommand(Command::new("reconfigure")
+                            .about("Reconfigure the endpoint")
+                            .arg(endpoint_pageserver_id_arg)
+                            .arg(endpoint_id_arg.clone())
+                            .arg(tenant_id_arg.clone())
+                )
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -253,7 +253,7 @@ impl Endpoint {
        conf.append("shared_buffers", "1MB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
-        conf.append("wal_level", "replica");
+        conf.append("wal_level", "logical");
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreciever will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
@@ -414,18 +414,34 @@ impl Endpoint {
            );
        }

-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
-        //
+        Ok(())
+    }
+
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
-
        Ok(())
    }

+    fn read_postgresql_conf(&self) -> Result<String> {
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
+        // memory. We will include it in the spec file that we pass to
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
+        // in the data directory.
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
+        match std::fs::read(&postgresql_conf_path) {
+            Ok(content) => Ok(String::from_utf8(content)?),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
+            Err(e) => Err(anyhow::Error::new(e).context(format!(
+                "failed to read config file in {}",
+                postgresql_conf_path.to_str().unwrap()
+            ))),
+        }
+    }
+
    pub fn start(
        &self,
        auth_token: &Option<String>,
@@ -436,21 +452,7 @@ impl Endpoint {
            anyhow::bail!("The endpoint is already running");
        }

-        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
-        // memory. We will include it in the spec file that we pass to
-        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
-        // in the data directory.
-        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
-        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
-            Ok(content) => String::from_utf8(content)?,
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
-            Err(e) => {
-                return Err(anyhow::Error::new(e).context(format!(
-                    "failed to read config file in {}",
-                    postgresql_conf_path.to_str().unwrap()
-                )))
-            }
-        };
+        let postgresql_conf = self.read_postgresql_conf()?;

        // We always start the compute node from scratch, so if the Postgres
        // data dir exists from a previous launch, remove it first.
@@ -621,6 +623,61 @@ impl Endpoint {
        }
    }

+    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+        let mut spec: ComputeSpec = {
+            let spec_path = self.endpoint_path().join("spec.json");
+            let file = std::fs::File::open(spec_path)?;
+            serde_json::from_reader(file)?
+        };
+
+        let postgresql_conf = self.read_postgresql_conf()?;
+        spec.cluster.postgresql_conf = Some(postgresql_conf);
+
+        if let Some(pageserver_id) = pageserver_id {
+            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
+            let mut endpoint_conf: EndpointConf = {
+                let file = std::fs::File::open(&endpoint_config_path)?;
+                serde_json::from_reader(file)?
+            };
+            endpoint_conf.pageserver_id = pageserver_id;
+            std::fs::write(
+                endpoint_config_path,
+                serde_json::to_string_pretty(&endpoint_conf)?,
+            )?;
+
+            let pageserver =
+                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+            let ps_http_conf = &pageserver.pg_connection_config;
+            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
+            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
+        }
+
+        let client = reqwest::blocking::Client::new();
+        let response = client
+            .post(format!(
+                "http://{}:{}/configure",
+                self.http_address.ip(),
+                self.http_address.port()
+            ))
+            .body(format!(
+                "{{\"spec\":{}}}",
+                serde_json::to_string_pretty(&spec)?
+            ))
+            .send()?;
+
+        let status = response.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            Ok(())
+        } else {
+            let url = response.url().to_owned();
+            let msg = match response.text() {
+                Ok(err_body) => format!("Error: {}", err_body),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+            };
+            Err(anyhow::anyhow!(msg))
+        }
+    }
+
    pub fn stop(&self, destroy: bool) -> Result<()> {
        // If we are going to destroy data directory,
        // use immediate shutdown mode, otherwise,
@@ -629,15 +686,25 @@ impl Endpoint {
        // Postgres is always started from scratch, so stop
        // without destroy only used for testing and debugging.
        //
+        self.pg_ctl(
+            if destroy {
+                &["-m", "immediate", "stop"]
+            } else {
+                &["stop"]
+            },
+            &None,
+        )?;
+
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
+        //
+        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
-            self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
            println!(
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
-        } else {
-            self.pg_ctl(&["stop"], &None)?;
        }
        Ok(())
    }
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -25,7 +25,7 @@
            },
            {
                "name": "wal_level",
-                "value": "replica",
+                "value": "logical",
                "vartype": "enum"
            },
            {
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -188,11 +188,60 @@ that.

 ## Error message style

+### PostgreSQL extensions
+
 PostgreSQL has a style guide for writing error messages:

 https://www.postgresql.org/docs/current/error-style-guide.html

 Follow that guide when writing error messages in the PostgreSQL
-extension. We don't follow it strictly in the pageserver and
-safekeeper, but the advice in the PostgreSQL style guide is generally
-good, and you can't go wrong by following it.
+extensions.
+
+### Neon Rust code
+
+#### Anyhow Context
+
+When adding anyhow `context()`, use the form `present-tense-verb+action`.
+
+Example:
+- Bad: `file.metadata().context("could not get file metadata")?;`
+- Good: `file.metadata().context("get file metadata")?;`
+
+#### Logging Errors
+
+When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`.
+
+If `e` is an `anyhow` error and you want to log the backtrace that it contains,
+use `{e:?}` instead of `{e:#}`.
+
+#### Rationale
+
+The `{:#}` ("alternate Display") of an `anyhow` error chain is the concatenation of the contexts, joined with `: `.
+
+For example, the Rust code at the end of this section will result in the output
+```
+ERROR  failed to list users: load users from server: parse response: invalid json
+```
+
+This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.:
+
+```
+ERROR  could not list users: could not load users from server: could not parse response: invalid json
+```
+
+
+```rust
+fn main() {
+  match list_users().context("list users") {
+    Ok(_) => ...,
+    Err(e) => tracing::error!("failed to {e:#}"),
+  }
+}
+fn list_users() {
+  http_get_users().context("load users from server")?;
+}
+fn http_get_users() {
+  let response = client....?;
+  response.parse().context("parse response")?; // fails with serde error "invalid json"
+}
+```
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -96,6 +96,16 @@ prefix_in_bucket = '/test_prefix/'

 `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

+or
+
+```toml
+[remote_storage]
+container_name = 'some-container-name'
+container_region = 'us-east'
+prefix_in_container = '/test-prefix/'
+```
+
+`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the Azure credentials if needed.

 ## Repository background tasks

--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -0,0 +1,108 @@
+# Updating Postgres
+
+## Minor Versions
+
+When upgrading to a new minor version of Postgres, please follow these steps:
+
+_Example: 15.4 is the new minor version to upgrade to from 15.3._
+
+1. Clone the Neon Postgres repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/postgres.git
+    ```
+
+1. Add the Postgres upstream remote.
+
+    ```shell
+    git remote add upstream https://git.postgresql.org/git/postgresql.git
+    ```
+
+1. Create a new branch based on the stable branch you are updating.
+
+    ```shell
+    git checkout -b my-branch REL_15_STABLE_neon
+    ```
+
+1. Tag the last commit on the stable branch you are updating.
+
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created on the tag and resolve any conflicts.
+
+    ```shell
+    git fetch upstream REL_15_4
+    git rebase REL_15_4
+    ```
+
+1. Run the Postgres test suite to make sure our commits have not affected
+Postgres in a negative way.
+
+    ```shell
+    make check
+    # OR
+    meson test -C builddir
+    ```
+
+1. Push your branch to the Neon Postgres repository.
+
+    ```shell
+    git push origin my-branch
+    ```
+
+1. Clone the Neon repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/neon.git
+    ```
+
+1. Create a new branch.
+
+1. Change the `revisions.json` file to point at the HEAD of your Postgres
+branch.
+
+1. Update the Git submodule.
+
+    ```shell
+    git submodule set-branch --branch my-branch vendor/postgres-v15
+    git submodule update --remote vendor/postgres-v15
+    ```
+
+1. Run the Neon test suite to make sure that Neon is still good to go on this
+minor Postgres release.
+
+    ```shell
+    ./scripts/poetry -k pg15
+    ```
+
+1. Commit your changes.
+
+1. Create a pull request, and wait for CI to go green.
+
+1. Force push the rebased Postgres branches into the Neon Postgres repository.
+
+    ```shell
+    git push --force origin my-branch:REL_15_STABLE_neon
+    ```
+
+    It may require disabling various branch protections.
+
+1. Update your Neon PR to point at the branches.
+
+    ```shell
+    git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
+    git commit --amend --no-edit
+    git push --force origin
+    ```
+
+1. Merge the pull request after getting approval(s) and CI completion.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -190,6 +190,8 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
+    pub replication: Option<bool>,
+    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

@@ -200,6 +202,12 @@ pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
    pub options: GenericOptions,
+    // These are derived flags, not present in the spec file.
+    // They are never set by the control plane.
+    #[serde(skip_deserializing, default)]
+    pub restrict_conn: bool,
+    #[serde(skip_deserializing, default)]
+    pub invalid: bool,
 }

 /// Common type representing both SQL statement params with or without value,
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -76,7 +76,7 @@
            },
            {
                "name": "wal_level",
-                "value": "replica",
+                "value": "logical",
                "vartype": "enum"
            },
            {
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -89,14 +89,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

-pub fn set_build_info_metric(revision: &str) {
+pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    let metric = register_int_gauge_vec!(
        "libmetrics_build_info",
        "Build/version information",
-        &["revision"]
+        &["revision", "build_tag"]
    )
    .expect("Failed to register build info metric");
-    metric.with_label_values(&[revision]).set(1);
+    metric.with_label_values(&[revision, build_tag]).set(1);
 }

 // Records I/O stats in a "cross-platform" way.
--- a/libs/metrics/src/wrappers.rs
+++ b/libs/metrics/src/wrappers.rs
@@ -1,6 +1,6 @@
 use std::io::{Read, Result, Write};

-/// A wrapper for an object implementing [Read](std::io::Read)
+/// A wrapper for an object implementing [Read]
 /// which allows a closure to observe the amount of bytes read.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -51,17 +51,17 @@ impl<'a, T> CountedReader<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Read](std::io::Read) implementor
+    /// Get an immutable reference to the underlying [Read] implementor
    pub fn inner(&self) -> &T {
        &self.reader
    }

-    /// Get a mutable reference to the underlying [Read](std::io::Read) implementor
+    /// Get a mutable reference to the underlying [Read] implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.reader
    }

-    /// Consume the wrapper and return the underlying [Read](std::io::Read) implementor
+    /// Consume the wrapper and return the underlying [Read] implementor
    pub fn into_inner(self) -> T {
        self.reader
    }
@@ -75,7 +75,7 @@ impl<T: Read> Read for CountedReader<'_, T> {
    }
 }

-/// A wrapper for an object implementing [Write](std::io::Write)
+/// A wrapper for an object implementing [Write]
 /// which allows a closure to observe the amount of bytes written.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -122,17 +122,17 @@ impl<'a, T> CountedWriter<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Write](std::io::Write) implementor
+    /// Get an immutable reference to the underlying [Write] implementor
    pub fn inner(&self) -> &T {
        &self.writer
    }

-    /// Get a mutable reference to the underlying [Write](std::io::Write) implementor
+    /// Get a mutable reference to the underlying [Write] implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.writer
    }

-    /// Consume the wrapper and return the underlying [Write](std::io::Write) implementor
+    /// Consume the wrapper and return the underlying [Write] implementor
    pub fn into_inner(self) -> T {
        self.writer
    }
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -17,7 +17,7 @@ pub struct ReAttachRequest {
 pub struct ReAttachResponseTenant {
    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
-    pub generation: u32,
+    pub gen: u32,
 }

 #[derive(Serialize, Deserialize)]
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -110,7 +110,6 @@ impl TenantState {
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
-            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custo Ord and PartialOrd implementations below with
-// deriving them.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them. This will require changes in walredoproc.c.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
    fn cmp(&self, other: &Self) -> Ordering {
-        let mut cmp = self.spcnode.cmp(&other.spcnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.dbnode.cmp(&other.dbnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.relnode.cmp(&other.relnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.forknum.cmp(&other.forknum);
-
-        cmp
+        // Custom ordering where we put forknum to the end of the list
+        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
+        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
    }
 }

--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -19,8 +19,8 @@ use tracing::{debug, error, info, trace};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
-    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
-    SQLSTATE_SUCCESSFUL_COMPLETION,
+    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN,
+    SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION,
 };

 /// An error, occurred during query processing:
@@ -30,6 +30,9 @@ pub enum QueryError {
    /// The connection was lost while processing the query.
    #[error(transparent)]
    Disconnected(#[from] ConnectionError),
+    /// We were instructed to shutdown while processing the query
+    #[error("Shutting down")]
+    Shutdown,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -44,7 +47,8 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006",         // connection failure
+            Self::Disconnected(_) => b"08006", // connection failure
+            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -238,6 +242,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
        }
    }

+    /// Cancellation safe as long as the underlying IO is cancellation safe.
    async fn shutdown(&mut self) -> io::Result<()> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.shutdown().await,
@@ -389,14 +394,37 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        shutdown_watcher: F,
    ) -> Result<(), QueryError>
    where
-        F: Fn() -> S,
+        F: Fn() -> S + Clone,
        S: Future,
    {
-        let ret = self.run_message_loop(handler, shutdown_watcher).await;
-        // socket might be already closed, e.g. if previously received error,
-        // so ignore result.
-        self.framed.shutdown().await.ok();
-        ret
+        let ret = self
+            .run_message_loop(handler, shutdown_watcher.clone())
+            .await;
+
+        tokio::select! {
+            _ = shutdown_watcher() => {
+                // do nothing; we most likely got already stopped by shutdown and will log it next.
+            }
+            _ = self.framed.shutdown() => {
+                // socket might be already closed, e.g. if previously received error,
+                // so ignore result.
+            },
+        }
+
+        match ret {
+            Ok(()) => Ok(()),
+            Err(QueryError::Shutdown) => {
+                info!("Stopped due to shutdown");
+                Ok(())
+            }
+            Err(QueryError::Disconnected(e)) => {
+                info!("Disconnected ({e:#})");
+                // Disconnection is not an error: we just use it that way internally to drop
+                // out of loops.
+                Ok(())
+            }
+            e => e,
+        }
    }

    async fn run_message_loop<F, S>(
@@ -416,15 +444,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
-                return Ok(())
+                return Err(QueryError::Shutdown)
            },

-            result = self.handshake(handler) => {
-                // Handshake complete.
-                result?;
-                if self.state == ProtoState::Closed {
-                    return Ok(()); // EOF during handshake
-                }
+            handshake_r = self.handshake(handler) => {
+                handshake_r?;
            }
        );

@@ -435,7 +459,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
-                Ok(None)
+                return Err(QueryError::Shutdown)
            },
            msg = self.read_message() => { msg },
        )? {
@@ -447,7 +471,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                _ = shutdown_watcher() => {
                    // We were requested to shut down.
                    tracing::info!("shutdown request received during response flush");
-                    return Ok(())
+
+                    // If we exited process_message with a shutdown error, there may be
+                    // some valid response content in our transmit buffer: permit sending
+                    // this within a short timeout.  This is a best effort thing so we don't
+                    // care about the result.
+                    tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok();
+
+                    return Err(QueryError::Shutdown)
                },
                flush_r = self.flush() => {
                    flush_r?;
@@ -560,7 +591,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Ok(());
+                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
+                        ProtocolError::Protocol("EOF during handshake".to_string()),
+                    )));
                }
            }
        }
@@ -599,7 +632,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Ok(());
+                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
+                        ProtocolError::Protocol("EOF during auth".to_string()),
+                    )));
                }
            }
        }
@@ -923,6 +958,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Shutdown => "shutdown".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -939,6 +975,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::Shutdown => {
+            info!("query handler for '{query}' cancelled during tenant shutdown")
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -131,6 +131,7 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
+pub use v14::xlog_utils::from_pg_timestamp;
 pub use v14::xlog_utils::get_current_timestamp;
 pub use v14::xlog_utils::to_pg_timestamp;
 pub use v14::xlog_utils::XLogFileName;
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -220,6 +220,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

+/* From replication/slot.h */
+pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
+   + 64 /* NameData */  + 4*4;
+
 /* From fsm_internals.h */
 const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
 const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -136,21 +136,42 @@ pub fn get_current_timestamp() -> TimestampTz {
    to_pg_timestamp(SystemTime::now())
 }

-pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
-    const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
-    const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
+// Module to reduce the scope of the constants
+mod timestamp_conversions {
+    use std::time::Duration;
+
+    use super::*;
+
+    const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
+    const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
    const SECS_PER_DAY: u64 = 86400;
    const USECS_PER_SEC: u64 = 1000000;
-    match time.duration_since(SystemTime::UNIX_EPOCH) {
-        Ok(n) => {
-            ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
-                * USECS_PER_SEC
-                + n.subsec_micros() as u64) as i64
+    const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 =
+        (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY;
+
+    pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
+        match time.duration_since(SystemTime::UNIX_EPOCH) {
+            Ok(n) => {
+                ((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC
+                    + n.subsec_micros() as u64) as i64
+            }
+            Err(_) => panic!("SystemTime before UNIX EPOCH!"),
        }
-        Err(_) => panic!("SystemTime before UNIX EPOCH!"),
+    }
+
+    pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
+        let time: u64 = time
+            .try_into()
+            .expect("timestamp before millenium (postgres epoch)");
+        let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
+        SystemTime::UNIX_EPOCH
+            .checked_add(Duration::from_micros(since_unix_epoch))
+            .expect("SystemTime overflow")
    }
 }

+pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
+
 // Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
 // start_lsn must point to some previously known record boundary (beginning of
 // the next record). If no valid record after is found, start_lsn is returned
@@ -481,4 +502,24 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
    wal
 }

-// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Both conversion directions must round-trip at microsecond precision.
+    #[test]
+    fn test_ts_conversion() {
+        // SystemTime -> Postgres timestamp -> SystemTime
+        let micros_since_epoch = |t: SystemTime| {
+            t.duration_since(SystemTime::UNIX_EPOCH)
+                .unwrap()
+                .as_micros()
+        };
+        let system_now = SystemTime::now();
+        let system_back = from_pg_timestamp(to_pg_timestamp(system_now));
+        assert_eq!(micros_since_epoch(system_now), micros_since_epoch(system_back));
+
+        // Postgres timestamp -> SystemTime -> Postgres timestamp
+        let pg_now = get_current_timestamp();
+        let pg_back = to_pg_timestamp(from_pg_timestamp(pg_now));
+        assert_eq!(pg_now, pg_back);
+    }
+
+    // If you need to craft WAL and write tests for this module, put it at wal_craft crate.
+}
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -14,6 +14,7 @@ macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
+            #[allow(unused_imports)]
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -214,27 +214,24 @@ where
    }
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn flush<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
 ) -> Result<(), io::Error> {
    while write_buf.has_remaining() {
-        let bytes_written = stream.write(write_buf.chunk()).await?;
+        let bytes_written = stream.write_buf(write_buf).await?;
        if bytes_written == 0 {
            return Err(io::Error::new(
                ErrorKind::WriteZero,
                "failed to write message",
            ));
        }
-        // The advanced part will be garbage collected, likely during shifting
-        // data left on next attempt to write to buffer when free space is not
-        // enough.
-        write_buf.advance(bytes_written);
    }
-    write_buf.clear();
    stream.flush().await
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn shutdown<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -670,6 +670,7 @@ pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
 }

 pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
+pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
 pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";

 impl<'a> BeMessage<'a> {
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,6 +13,7 @@ aws-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
+bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 serde.workspace = true
@@ -26,6 +27,13 @@ metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
 workspace_hack.workspace = true
+azure_core.workspace = true
+azure_identity.workspace = true
+azure_storage.workspace = true
+azure_storage_blobs.workspace = true
+futures-util.workspace = true
+http-types.workspace = true
+itertools.workspace = true

 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -0,0 +1,337 @@
+//! Azure Blob Storage wrapper
+
+use std::env;
+use std::num::NonZeroU32;
+use std::sync::Arc;
+use std::{borrow::Cow, collections::HashMap, io::Cursor};
+
+use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
+use anyhow::Result;
+use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::Header;
+use azure_identity::DefaultAzureCredential;
+use azure_storage::StorageCredentials;
+use azure_storage_blobs::prelude::ClientBuilder;
+use azure_storage_blobs::{
+    blob::operations::GetBlobBuilder,
+    prelude::{BlobClient, ContainerClient},
+};
+use futures_util::StreamExt;
+use http_types::StatusCode;
+use tokio::io::AsyncRead;
+use tracing::debug;
+
+use crate::s3_bucket::RequestKind;
+use crate::{
+    AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
+    RemoteStorage, StorageMetadata,
+};
+
+/// Remote storage backend that keeps objects as blobs in one Azure container.
+pub struct AzureBlobStorage {
+    // Client for the container named in the config.
+    client: ContainerClient,
+    // Optional prefix that is prepended to every blob name.
+    prefix_in_container: Option<String>,
+    // Page size passed to list requests, when set.
+    max_keys_per_list_response: Option<NonZeroU32>,
+    // Hands out the permits taken before requests are issued.
+    concurrency_limiter: ConcurrencyLimiter,
+}
+
+impl AzureBlobStorage {
+    /// Builds a storage client for the container described by `azure_config`.
+    ///
+    /// The account name is read from the `AZURE_STORAGE_ACCOUNT` environment variable
+    /// (panics when it is unset). If `AZURE_STORAGE_ACCESS_KEY` is set it is used as the
+    /// credential, otherwise `DefaultAzureCredential` is used.
+    ///
+    /// Returns an error when `max_keys_per_list_response` is set but not strictly positive.
+    pub fn new(azure_config: &AzureConfig) -> Result<Self> {
+        debug!(
+            "Creating azure remote storage for azure container {}",
+            azure_config.container_name
+        );
+
+        let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
+
+        // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
+        // otherwise try the token based credentials.
+        let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
+            StorageCredentials::access_key(account.clone(), access_key)
+        } else {
+            let token_credential = DefaultAzureCredential::default();
+            StorageCredentials::token_credential(Arc::new(token_credential))
+        };
+
+        let builder = ClientBuilder::new(account, credentials);
+
+        let client = builder.container_client(azure_config.container_name.to_owned());
+
+        // A plain `as u32` cast would turn a negative limit into a huge page size, so
+        // convert checked and reject everything that is not strictly positive.
+        let max_keys_per_list_response = azure_config
+            .max_keys_per_list_response
+            .map(|limit| {
+                u32::try_from(limit)
+                    .ok()
+                    .and_then(NonZeroU32::new)
+                    .ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "max_keys_per_list_response must be a positive integer, got {limit}"
+                        )
+                    })
+            })
+            .transpose()?;
+
+        Ok(AzureBlobStorage {
+            client,
+            prefix_in_container: azure_config.prefix_in_container.to_owned(),
+            max_keys_per_list_response,
+            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
+        })
+    }
+
+    /// Maps a `RemotePath` to the blob name used inside the container.
+    ///
+    /// Trailing separators of the path are trimmed. When `prefix_in_container` is set it is
+    /// put in front, with a separator inserted unless the prefix already ends with one.
+    pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
+        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let trimmed = path
+            .get_path()
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        if let Some(prefix) = self.prefix_in_container.as_deref() {
+            let mut name = String::with_capacity(prefix.len() + 1 + trimmed.len());
+            name.push_str(prefix);
+            if !prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                name.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+            }
+            name.push_str(trimmed);
+            name
+        } else {
+            trimmed.to_owned()
+        }
+    }
+
+    /// Strips `prefix_in_container` from a blob name and turns the remainder into a
+    /// `RemotePath`.
+    ///
+    /// Panics when `key` does not start with the configured prefix.
+    fn name_to_relative_path(&self, key: &str) -> RemotePath {
+        let container_prefix = self.prefix_in_container.as_deref().unwrap_or_default();
+        // we rely on Azure to return properly prefixed paths
+        // for requests with a certain prefix
+        let relative_path = key.strip_prefix(container_prefix).unwrap_or_else(|| {
+            panic!(
+                "Key {key} does not start with container prefix {:?}",
+                self.prefix_in_container
+            )
+        });
+        let components = relative_path.split(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        RemotePath(components.collect())
+    }
+
+    /// Runs the prepared get-blob request and buffers the complete body in memory.
+    ///
+    /// The returned `Download` carries the buffered bytes together with `metadata`.
+    async fn download_for_builder(
+        &self,
+        metadata: StorageMetadata,
+        builder: GetBlobBuilder,
+    ) -> Result<Download, DownloadError> {
+        // TODO give proper streaming response instead of buffering into RAM
+        // https://github.com/neondatabase/neon/issues/5563
+        let mut contents = Vec::new();
+        let mut chunks = builder.into_stream();
+        while let Some(chunk) = chunks.next().await {
+            let bytes = chunk
+                .map_err(to_download_error)?
+                .data
+                .collect()
+                .await
+                .map_err(|e| DownloadError::Other(e.into()))?;
+            contents.extend_from_slice(&bytes.slice(..));
+        }
+
+        let download_stream = Box::pin(Cursor::new(contents));
+        Ok(Download {
+            download_stream,
+            metadata: Some(metadata),
+        })
+    }
+    /// Fetches the metadata of the blob behind `blob_client` with a dedicated request.
+    ///
+    /// TODO get rid of this function once we have metadata included in the response
+    /// https://github.com/Azure/azure-sdk-for-rust/issues/1439
+    async fn get_metadata(
+        &self,
+        blob_client: &BlobClient,
+    ) -> Result<StorageMetadata, DownloadError> {
+        let response = blob_client
+            .get_metadata()
+            .into_future()
+            .await
+            .map_err(to_download_error)?;
+
+        let entries: HashMap<String, String> = response
+            .metadata
+            .iter()
+            .map(|md| {
+                let name = md.name().as_str().to_string();
+                let value = md.value().as_str().to_string();
+                (name, value)
+            })
+            .collect();
+        Ok(StorageMetadata(entries))
+    }
+
+    /// Waits for a permit from the concurrency limiter for the given request kind.
+    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+        self.concurrency_limiter
+            .acquire(kind)
+            .await
+            .expect("semaphore is never closed")
+    }
+}
+
+/// Copies every key/value pair of `metadata` into an Azure `Metadata` value.
+fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
+    let StorageMetadata(entries) = metadata;
+    let mut azure_metadata = Metadata::new();
+    entries.into_iter().for_each(|(key, value)| {
+        azure_metadata.insert(key, value);
+    });
+    azure_metadata
+}
+
+fn to_download_error(error: azure_core::Error) -> DownloadError {
+    if let Some(http_err) = error.as_http_error() {
+        match http_err.status() {
+            StatusCode::NotFound => DownloadError::NotFound,
+            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
+            _ => DownloadError::Other(anyhow::Error::new(error)),
+        }
+    } else {
+        DownloadError::Other(error.into())
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for AzureBlobStorage {
+    /// Lists the blobs below `prefix`, or below `prefix_in_container` when `prefix` is `None`.
+    ///
+    /// In `ListingMode::WithDelimiter` the separator is sent as delimiter and the prefix is
+    /// forced to end with it; the response's prefixes and blobs fill `Listing::prefixes`
+    /// and `Listing::keys`. All response pages are collected before returning.
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError> {
+        // Listing must respect the concurrency limit like every other request kind.
+        let _permit = self.permit(RequestKind::List).await;
+
+        // get the passed prefix or if it is not set use prefix_in_container value
+        let list_prefix = prefix
+            .map(|p| self.relative_path_to_name(p))
+            .or_else(|| self.prefix_in_container.clone())
+            .map(|mut p| {
+                // required to end with a separator
+                // otherwise request will return only the entry of a prefix
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
+                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                }
+                p
+            });
+
+        let mut builder = self.client.list_blobs();
+
+        if let ListingMode::WithDelimiter = mode {
+            builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+        }
+
+        if let Some(prefix) = list_prefix {
+            // `prefix` is already an owned `String`, no extra copy needed
+            builder = builder.prefix(Cow::from(prefix));
+        }
+
+        if let Some(limit) = self.max_keys_per_list_response {
+            builder = builder.max_results(MaxResults::new(limit));
+        }
+
+        let mut response = builder.into_stream();
+        let mut res = Listing::default();
+        while let Some(l) = response.next().await {
+            let entry = l.map_err(to_download_error)?;
+            let prefix_iter = entry
+                .blobs
+                .prefixes()
+                .map(|prefix| self.name_to_relative_path(&prefix.name));
+            res.prefixes.extend(prefix_iter);
+
+            let blob_iter = entry
+                .blobs
+                .blobs()
+                .map(|k| self.name_to_relative_path(&k.name));
+            res.keys.extend(blob_iter);
+        }
+        Ok(res)
+    }
+    /// Uploads everything readable from `from` as a block blob at `to`, attaching
+    /// `metadata` when given. `data_size_bytes` only pre-sizes the in-memory buffer.
+    async fn upload(
+        &self,
+        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Put).await;
+        let blob_name = self.relative_path_to_name(to);
+        let blob_client = self.client.blob_client(blob_name);
+
+        // TODO FIX THIS UGLY HACK and don't buffer the entire object
+        // into RAM here, but use the streaming interface. For that,
+        // we'd have to change the interface though...
+        // https://github.com/neondatabase/neon/issues/5563
+        let mut contents = Vec::with_capacity(data_size_bytes);
+        tokio::io::copy(&mut from, &mut contents).await?;
+
+        let request = blob_client.put_block_blob(azure_core::Body::Bytes(contents.into()));
+        let request = match metadata {
+            Some(metadata) => request.metadata(to_azure_metadata(metadata)),
+            None => request,
+        };
+
+        request.into_future().await?;
+        Ok(())
+    }
+
+    /// Downloads the whole blob at `from`; its metadata is fetched with a separate request
+    /// first.
+    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        let _permit = self.permit(RequestKind::Get).await;
+
+        let blob_name = self.relative_path_to_name(from);
+        let blob_client = self.client.blob_client(blob_name);
+        let metadata = self.get_metadata(&blob_client).await?;
+
+        self.download_for_builder(metadata, blob_client.get()).await
+    }
+
+    /// Downloads the byte range `start_inclusive..end_exclusive` of the blob at `from`.
+    /// Without an upper bound a very large one is substituted (see below).
+    async fn download_byte_range(
+        &self,
+        from: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+    ) -> Result<Download, DownloadError> {
+        let _permit = self.permit(RequestKind::Get).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
+
+        let metadata = self.get_metadata(&blob_client).await?;
+
+        // Open ranges are not supported by the SDK so we work around
+        // by setting the upper limit extremely high (but high enough
+        // to still be representable by signed 64 bit integers).
+        // TODO remove workaround once the SDK adds open range support
+        // https://github.com/Azure/azure-sdk-for-rust/issues/1438
+        let range_end = end_exclusive.unwrap_or(u64::MAX / 4);
+        let builder = blob_client
+            .get()
+            .range(Range::new(start_inclusive, range_end));
+
+        self.download_for_builder(metadata, builder).await
+    }
+
+    /// Deletes the blob at `path`. A blob that does not exist (HTTP 404) counts as success.
+    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Delete).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+
+        let is_not_found = |e: &azure_core::Error| {
+            e.as_http_error()
+                .map_or(false, |http_err| http_err.status() == StatusCode::NotFound)
+        };
+
+        match blob_client.delete().into_future().await {
+            Err(e) if !is_not_found(&e) => Err(anyhow::Error::new(e)),
+            // either deleted, or the blob was already gone
+            _ => Ok(()),
+        }
+    }
+
+    /// Deletes `paths` one after another and stops at the first error.
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        // Permit is already obtained by inner delete function
+
+        // TODO batch requests are also not supported by the SDK
+        // https://github.com/Azure/azure-sdk-for-rust/issues/1068
+        // https://github.com/Azure/azure-sdk-for-rust/issues/1249
+        for path in paths {
+            self.delete(path).await?;
+        }
+        Ok(())
+    }
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -4,7 +4,10 @@
 //! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
 //!   * [`local_fs`] allows to use local file system as an external storage
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
+//!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+
+mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
@@ -21,11 +24,15 @@ use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};

 use serde::{Deserialize, Serialize};
-use tokio::io;
+use tokio::{io, sync::Semaphore};
 use toml_edit::Item;
 use tracing::info;

-pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
+pub use self::{
+    azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket,
+    simulate_failures::UnreliableWrapper,
+};
+use s3_bucket::RequestKind;

 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -39,6 +46,11 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
+/// We set this a little bit low as we currently buffer the entire file into RAM
+///
+/// Here, a limit of max 20k concurrent connections was noted.
+/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
+pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -117,6 +129,22 @@ impl RemotePath {
    }
 }

+/// We don't need callers to be able to pass arbitrary delimiters: just control
+/// whether listings will use a '/' separator or not.
+///
+/// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
+/// NoDelimiter mode will only populate `keys`.
+pub enum ListingMode {
+    /// List with the separator as delimiter: fills `prefixes` and `keys`.
+    WithDelimiter,
+    /// List without a delimiter: fills only `keys`.
+    NoDelimiter,
+}
+
+/// Result of a `list` call.
+#[derive(Default)]
+pub struct Listing {
+    /// Prefixes found by the listing; populated in `ListingMode::WithDelimiter` only.
+    pub prefixes: Vec<RemotePath>,
+    /// Keys found by the listing.
+    pub keys: Vec<RemotePath>,
+}
+
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
@@ -129,8 +157,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError>;
-
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let result = self
+            .list(prefix, ListingMode::WithDelimiter)
+            .await?
+            .prefixes;
+        Ok(result)
+    }
    /// Lists all files in directory "recursively"
    /// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtely different than list_prefixes,
@@ -142,7 +175,16 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
+    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
+        Ok(result)
+    }
+
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        _mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -193,6 +235,9 @@ pub enum DownloadError {
    BadInput(anyhow::Error),
    /// The file was not found in the remote storage.
    NotFound,
+    /// A cancellation token aborted the download, typically during
+    /// tenant detach or process shutdown.
+    Cancelled,
    /// The file was found in the remote storage, but the download failed.
    Other(anyhow::Error),
 }
@@ -203,6 +248,7 @@ impl std::fmt::Display for DownloadError {
            DownloadError::BadInput(e) => {
                write!(f, "Failed to download a remote file due to user input: {e}")
            }
+            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
        }
@@ -217,10 +263,24 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
+    AzureBlob(Arc<AzureBlobStorage>),
    Unreliable(Arc<UnreliableWrapper>),
 }

 impl GenericRemoteStorage {
+    /// Forwards the listing request to the wrapped storage implementation.
+    pub async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list(prefix, mode).await,
+            Self::AwsS3(s) => s.list(prefix, mode).await,
+            Self::AzureBlob(s) => s.list(prefix, mode).await,
+            Self::Unreliable(s) => s.list(prefix, mode).await,
+        }
+    }
+
    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
@@ -228,6 +288,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_files(folder).await,
            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::AzureBlob(s) => s.list_files(folder).await,
            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }
@@ -242,6 +303,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_prefixes(prefix).await,
            Self::AwsS3(s) => s.list_prefixes(prefix).await,
+            Self::AzureBlob(s) => s.list_prefixes(prefix).await,
            Self::Unreliable(s) => s.list_prefixes(prefix).await,
        }
    }
@@ -256,6 +318,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
+            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
        }
    }
@@ -264,6 +327,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.download(from).await,
            Self::AwsS3(s) => s.download(from).await,
+            Self::AzureBlob(s) => s.download(from).await,
            Self::Unreliable(s) => s.download(from).await,
        }
    }
@@ -283,6 +347,10 @@ impl GenericRemoteStorage {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
            }
+            Self::AzureBlob(s) => {
+                s.download_byte_range(from, start_inclusive, end_exclusive)
+                    .await
+            }
            Self::Unreliable(s) => {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
@@ -294,6 +362,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete(path).await,
            Self::AwsS3(s) => s.delete(path).await,
+            Self::AzureBlob(s) => s.delete(path).await,
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
@@ -302,6 +371,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete_objects(paths).await,
            Self::AwsS3(s) => s.delete_objects(paths).await,
+            Self::AzureBlob(s) => s.delete_objects(paths).await,
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
@@ -319,6 +389,11 @@ impl GenericRemoteStorage {
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
            }
+            RemoteStorageKind::AzureContainer(azure_config) => {
+                info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
+                      azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
+                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
+            }
        })
    }

@@ -383,6 +458,9 @@ pub enum RemoteStorageKind {
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
@@ -422,11 +500,45 @@ impl Debug for S3Config {
    }
 }

+/// Azure container coordinates and request limits used to manage the container contents
+/// (read and write).
+#[derive(Clone, PartialEq, Eq)]
+pub struct AzureConfig {
+    /// Name of the container to connect to.
+    pub container_name: String,
+    /// The region where the container is located at.
+    pub container_region: String,
+    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
+    pub prefix_in_container: Option<String>,
+    /// Azure has various limits on its API calls, we need not to exceed those.
+    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
+    pub concurrency_limit: NonZeroUsize,
+    /// Optional cap on the number of entries requested per list response.
+    pub max_keys_per_list_response: Option<i32>,
+}
+
+// Hand-written `Debug` that prints every field of `AzureConfig`.
+impl Debug for AzureConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // The labels match the struct's field names, so the output cannot be
+        // mistaken for an S3 config.
+        f.debug_struct("AzureConfig")
+            .field("container_name", &self.container_name)
+            .field("container_region", &self.container_region)
+            .field("prefix_in_container", &self.prefix_in_container)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .field(
+                "max_keys_per_list_response",
+                &self.max_keys_per_list_response,
+            )
+            .finish()
+    }
+}
+
 impl RemoteStorageConfig {
    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let local_path = toml.get("local_path");
        let bucket_name = toml.get("bucket_name");
        let bucket_region = toml.get("bucket_region");
+        let container_name = toml.get("container_name");
+        let container_region = toml.get("container_region");
+
+        let use_azure = container_name.is_some() && container_region.is_some();

        let max_concurrent_syncs = NonZeroUsize::new(
            parse_optional_integer("max_concurrent_syncs", toml)?
@@ -440,9 +552,13 @@ impl RemoteStorageConfig {
        )
        .context("Failed to parse 'max_sync_errors' as a positive integer")?;

+        let default_concurrency_limit = if use_azure {
+            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
+        } else {
+            DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+        };
        let concurrency_limit = NonZeroUsize::new(
-            parse_optional_integer("concurrency_limit", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
+            parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
        )
        .context("Failed to parse 'concurrency_limit' as a positive integer")?;

@@ -451,33 +567,70 @@ impl RemoteStorageConfig {
                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);

-        let storage = match (local_path, bucket_name, bucket_region) {
+        let endpoint = toml
+            .get("endpoint")
+            .map(|endpoint| parse_toml_string("endpoint", endpoint))
+            .transpose()?;
+
+        let storage = match (
+            local_path,
+            bucket_name,
+            bucket_region,
+            container_name,
+            container_region,
+        ) {
            // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
-            (None, None, None) => return Ok(None),
-            (_, Some(_), None) => {
+            (None, None, None, None, None) => return Ok(None),
+            (_, Some(_), None, ..) => {
                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
            }
-            (_, None, Some(_)) => {
+            (_, None, Some(_), ..) => {
                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
            }
-            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
-                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
-                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                prefix_in_bucket: toml
-                    .get("prefix_in_bucket")
-                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
-                    .transpose()?,
-                endpoint: toml
-                    .get("endpoint")
-                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
-                    .transpose()?,
-                concurrency_limit,
-                max_keys_per_list_response,
-            }),
-            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(Utf8PathBuf::from(
-                parse_toml_string("local_path", local_path)?,
-            )),
-            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
+            (None, Some(bucket_name), Some(bucket_region), ..) => {
+                RemoteStorageKind::AwsS3(S3Config {
+                    bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                    bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                    prefix_in_bucket: toml
+                        .get("prefix_in_bucket")
+                        .map(|prefix_in_bucket| {
+                            parse_toml_string("prefix_in_bucket", prefix_in_bucket)
+                        })
+                        .transpose()?,
+                    endpoint,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                })
+            }
+            // 'container_name' is set but 'container_region' is missing
+            (_, _, _, Some(_), None) => {
+                bail!("'container_region' option is mandatory if 'container_name' is given ")
+            }
+            (_, _, _, None, Some(_)) => {
+                bail!("'container_name' option is mandatory if 'container_region' is given ")
+            }
+            (None, None, None, Some(container_name), Some(container_region)) => {
+                RemoteStorageKind::AzureContainer(AzureConfig {
+                    container_name: parse_toml_string("container_name", container_name)?,
+                    container_region: parse_toml_string("container_region", container_region)?,
+                    prefix_in_container: toml
+                        .get("prefix_in_container")
+                        .map(|prefix_in_container| {
+                            parse_toml_string("prefix_in_container", prefix_in_container)
+                        })
+                        .transpose()?,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                })
+            }
+            (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
+                Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
+            ),
+            (Some(_), Some(_), ..) => {
+                bail!("'local_path' and 'bucket_name' are mutually exclusive")
+            }
+            (Some(_), _, _, Some(_), Some(_)) => {
+                bail!("local_path and 'container_name' are mutually exclusive")
+            }
        };

        Ok(Some(RemoteStorageConfig {
@@ -513,6 +666,46 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
    Ok(s.to_string())
 }

+struct ConcurrencyLimiter {
+    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
+    // Same goes for IAM, which is queried before every S3 request, if enabled. IAM has an even lower RPS threshold.
+    // This helps to ensure we don't exceed the thresholds.
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    /// Selects the semaphore for `kind`: `Get`/`List` share `read`, `Put`/`Delete` share `write`.
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get | RequestKind::List => &self.read,
+            RequestKind::Put | RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    /// Creates a limiter whose read and write semaphores each start with `limit` permits.
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            write: Arc::new(Semaphore::new(limit)),
+            read: Arc::new(Semaphore::new(limit)),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -15,7 +15,7 @@ use tokio::{
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, RemotePath};
+use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};

 use super::{RemoteStorage, StorageMetadata};

@@ -75,7 +75,7 @@ impl LocalFs {
    }

    #[cfg(test)]
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
        Ok(get_all_files(&self.storage_root, true)
            .await?
            .into_iter()
@@ -89,52 +89,10 @@ impl LocalFs {
            })
            .collect())
    }
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for LocalFs {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let path = match prefix {
-            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-            None => Cow::Borrowed(&self.storage_root),
-        };
-
-        let prefixes_to_filter = get_all_files(path.as_ref(), false)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
-
-        // filter out empty directories to mirror s3 behavior.
-        for prefix in prefixes_to_filter {
-            if prefix.is_dir()
-                && is_directory_empty(&prefix)
-                    .await
-                    .map_err(DownloadError::Other)?
-            {
-                continue;
-            }
-
-            prefixes.push(
-                prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
-                    .and_then(RemotePath::new)
-                    .expect(
-                        "We list files for storage root, hence should be able to remote the prefix",
-                    ),
-            )
-        }
-
-        Ok(prefixes)
-    }

    // recursively lists all files in a directory,
    // mirroring the `list_files` for `s3_bucket`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let full_path = match folder {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
@@ -186,6 +144,70 @@ impl RemoteStorage for LocalFs {

        Ok(files)
    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for LocalFs {
+    /// Lists `prefix` (or the storage root): recursively, keys only, for `NoDelimiter`; otherwise one level.
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
+        let mut result = Listing::default();
+
+        // Without a delimiter, only keys are reported: directories are dropped from the recursive listing.
+        if let ListingMode::NoDelimiter = mode {
+            result.keys = self
+                .list_recursive(prefix)
+                .await
+                .map_err(DownloadError::Other)?
+                .into_iter()
+                .filter(|key| !key.with_base(&self.storage_root).is_dir())
+                .collect();
+
+            return Ok(result);
+        }
+
+        let path = match prefix {
+            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+            None => Cow::Borrowed(&self.storage_root),
+        };
+
+        let entries = get_all_files(path.as_ref(), false)
+            .await
+            .map_err(DownloadError::Other)?;
+
+        for entry in entries {
+            // Stat the entry once; the result drives both the filter and the classification below.
+            let is_dir = entry.is_dir();
+
+            // filter out empty directories to mirror s3 behavior.
+            if is_dir
+                && is_directory_empty(&entry)
+                    .await
+                    .map_err(DownloadError::Other)?
+            {
+                continue;
+            }
+
+            let stripped = entry
+                .strip_prefix(&self.storage_root)
+                .context("Failed to strip prefix")
+                .and_then(RemotePath::new)
+                .expect(
+                    "We list files for storage root, hence should be able to remote the prefix",
+                );
+
+            if is_dir {
+                result.prefixes.push(stripped);
+            } else {
+                result.keys.push(stripped);
+            }
+        }
+
+        Ok(result)
+    }

    async fn upload(
        &self,
@@ -479,7 +501,7 @@ mod fs_tests {

        let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
        assert_eq!(
-            storage.list().await?,
+            storage.list_all().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );
@@ -667,7 +689,7 @@ mod fs_tests {
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

        storage.delete(&upload_target).await?;
-        assert!(storage.list().await?.is_empty());
+        assert!(storage.list_all().await?.is_empty());

        storage
            .delete(&upload_target)
@@ -725,6 +747,43 @@ mod fs_tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn list() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let storage = create_storage()?;
+        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
+        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
+
+        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
+        assert!(listing.prefixes.is_empty());
+        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+
+        // Delimiter: should only go one deep
+        let listing = storage.list(None, ListingMode::WithDelimiter).await?;
+
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines").unwrap()].to_vec()
+        );
+        assert!(listing.keys.is_empty());
+
+        // Delimiter & prefix
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
+                ListingMode::WithDelimiter,
+            )
+            .await?;
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
+                .to_vec()
+        );
+        assert_eq!(listing.keys, [uncle.clone()].to_vec());
+
+        Ok(())
+    }
+
    async fn upload_dummy_file(
        storage: &LocalFs,
        name: &str,
@@ -777,7 +836,7 @@ mod fs_tests {
    }

    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
-        let mut files = storage.list().await?;
+        let mut files = storage.list_all().await?;
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,7 +4,7 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::{borrow::Cow, sync::Arc};
+use std::borrow::Cow;

 use anyhow::Context;
 use aws_config::{
@@ -24,22 +24,20 @@ use aws_sdk_s3::{
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::{
-    io::{self, AsyncRead},
-    sync::Semaphore,
-};
+use tokio::io::{self, AsyncRead};
 use tokio_util::io::ReaderStream;
 use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
+    S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;

-use self::metrics::{AttemptOutcome, RequestKind};
+use self::metrics::AttemptOutcome;
+pub(super) use self::metrics::RequestKind;

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -50,46 +48,6 @@ pub struct S3Bucket {
    concurrency_limiter: ConcurrencyLimiter,
 }

-struct ConcurrencyLimiter {
-    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
-    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
-    // The helps to ensure we don't exceed the thresholds.
-    write: Arc<Semaphore>,
-    read: Arc<Semaphore>,
-}
-
-impl ConcurrencyLimiter {
-    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
-        match kind {
-            RequestKind::Get => &self.read,
-            RequestKind::Put => &self.write,
-            RequestKind::List => &self.read,
-            RequestKind::Delete => &self.write,
-        }
-    }
-
-    async fn acquire(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
-        self.for_kind(kind).acquire().await
-    }
-
-    async fn acquire_owned(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
-        Arc::clone(self.for_kind(kind)).acquire_owned().await
-    }
-
-    fn new(limit: usize) -> ConcurrencyLimiter {
-        Self {
-            read: Arc::new(Semaphore::new(limit)),
-            write: Arc::new(Semaphore::new(limit)),
-        }
-    }
-}
-
 #[derive(Default)]
 struct GetObjectRequest {
    bucket: String,
@@ -341,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    /// See the doc for `RemoteStorage::list_prefixes`
-    /// Note: it wont include empty "directories"
-    async fn list_prefixes(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
+        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -356,28 +314,33 @@ impl RemoteStorage for S3Bucket {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

-        let mut document_keys = Vec::new();
-
        let mut continuation_token = None;

        loop {
            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

-            let fetch_response = self
+            let mut request = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
-                .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
-                .set_max_keys(self.max_keys_per_list_response)
+                .set_max_keys(self.max_keys_per_list_response);
+
+            if let ListingMode::WithDelimiter = mode {
+                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+            }
+
+            let response = request
                .send()
                .await
                .context("Failed to list S3 prefixes")
@@ -387,71 +350,35 @@ impl RemoteStorage for S3Bucket {

            metrics::BUCKET_METRICS
                .req_seconds
-                .observe_elapsed(kind, &fetch_response, started_at);
+                .observe_elapsed(kind, &response, started_at);

-            let fetch_response = fetch_response?;
+            let response = response?;

-            document_keys.extend(
-                fetch_response
-                    .common_prefixes
-                    .unwrap_or_default()
-                    .into_iter()
+            let keys = response.contents().unwrap_or_default();
+            let empty = Vec::new();
+            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
+
+            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+
+            for object in keys {
+                let object_path = object.key().expect("response does not contain a key");
+                let remote_path = self.s3_object_to_relative_path(object_path);
+                result.keys.push(remote_path);
+            }
+
+            result.prefixes.extend(
+                prefixes
+                    .iter()
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            continuation_token = match fetch_response.next_continuation_token {
+            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
                None => break,
            };
        }

-        Ok(document_keys)
-    }
-
-    /// See the doc for `RemoteStorage::list_files`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let kind = RequestKind::List;
-
-        let folder_name = folder
-            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone());
-
-        // AWS may need to break the response into several parts
-        let mut continuation_token = None;
-        let mut all_files = vec![];
-        loop {
-            let _guard = self.permit(kind).await;
-            let started_at = start_measuring_requests(kind);
-
-            let response = self
-                .client
-                .list_objects_v2()
-                .bucket(self.bucket_name.clone())
-                .set_prefix(folder_name.clone())
-                .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response)
-                .send()
-                .await
-                .context("Failed to list files in S3 bucket");
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
-
-            let response = response?;
-
-            for object in response.contents().unwrap_or_default() {
-                let object_path = object.key().expect("response does not contain a key");
-                let remote_path = self.s3_object_to_relative_path(object_path);
-                all_files.push(remote_path);
-            }
-            match response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-        Ok(all_files)
+        Ok(result)
    }

    async fn upload(
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -6,7 +6,7 @@ use once_cell::sync::Lazy;
 pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
-pub(super) enum RequestKind {
+pub(crate) enum RequestKind {
    Get = 0,
    Put = 1,
    Delete = 2,
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -5,7 +5,9 @@ use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;

-use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
+use crate::{
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+};

 pub struct UnreliableWrapper {
    inner: crate::GenericRemoteStorage,
@@ -95,6 +97,15 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_files(folder).await
    }

+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list(prefix, mode).await
+    }
+
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -0,0 +1,625 @@
+use std::collections::HashSet;
+use std::env;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::ops::ControlFlow;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::UNIX_EPOCH;
+
+use anyhow::Context;
+use camino::Utf8Path;
+use once_cell::sync::OnceCell;
+use remote_storage::{
+    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+};
+use test_context::{test_context, AsyncTestContext};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};
+
+static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+
+const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
+
+const BASE_PREFIX: &str = "test";
+
+/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
+/// See the client creation in [`create_azure_client`] for details on the required env vars.
+/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored at runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
+/// where
+/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the container
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
+///
+/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledAzureWithTestBlobs)]
+#[tokio::test]
+async fn azure_pagination_should_work(
+    ctx: &mut MaybeEnabledAzureWithTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+/// Tests that the Azure client can list all files in a folder, even if the response comes paginated and requires multiple Azure queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See [`azure_pagination_should_work`] for more information.
+///
+/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
+#[tokio::test]
+async fn azure_list_files_works(
+    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirrectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let data1 = "remote blob data1".as_bytes();
+    let data1_len = data1.len();
+    let data2 = "remote blob data2".as_bytes();
+    let data2_len = data2.len();
+    let data3 = "remote blob data3".as_bytes();
+    let data3_len = data3.len();
+    ctx.client
+        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
+        .await?;
+
+    ctx.client
+        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
+        .await?;
+
+    ctx.client
+        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
+        .await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    let prefixes = ctx.client.list_prefixes(None).await?;
+
+    assert_eq!(prefixes.len(), 1);
+
+    ctx.client.delete_objects(&[path3]).await?;
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let data = "remote blob data here".as_bytes();
+    let data_len = data.len() as u64;
+
+    ctx.client
+        .upload(std::io::Cursor::new(data), data.len(), &path, None)
+        .await?;
+
+    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+        let mut buf = Vec::new();
+        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        Ok(buf)
+    }
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(data_len))
+        .await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data);
+
+    // partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data[4..10]);
+
+    // partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(data_len * 100))
+        .await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data[8..]);
+
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
+
+fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
+    });
+}
+
+struct EnabledAzure {
+    client: Arc<GenericRemoteStorage>,
+    base_prefix: &'static str,
+}
+
+impl EnabledAzure {
+    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
+        let client = create_azure_client(max_keys_in_list_response)
+            .context("Azure client creation")
+            .expect("Azure client creation failed");
+
+        EnabledAzure {
+            client,
+            base_prefix: BASE_PREFIX,
+        }
+    }
+}
+
+enum MaybeEnabledAzure {
+    Enabled(EnabledAzure),
+    Disabled,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzure {
+    async fn setup() -> Self {
+        ensure_logging_ready();
+
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        Self::Enabled(EnabledAzure::setup(None).await)
+    }
+}
+
+enum MaybeEnabledAzureWithTestBlobs {
+    Enabled(AzureWithTestBlobs),
+    Disabled,
+    UploadsFailed(anyhow::Error, AzureWithTestBlobs),
+}
+
+struct AzureWithTestBlobs {
+    enabled: EnabledAzure,
+    remote_prefixes: HashSet<RemotePath>,
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
+    /// Creates the client with a small list-response page size and uploads enough blobs
+    /// (`1 + 2 * page size`) that listing them needs more than two pages.
+    ///
+    /// Returns `Disabled` when the enabling env variable is not set, and `UploadsFailed` when
+    /// any of the uploads did not succeed.
+    async fn setup() -> Self {
+        ensure_logging_ready();
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
+        let upload_outcome =
+            upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await;
+
+        // Both outcomes carry the successfully uploaded objects; only the failure adds an error.
+        let (uploads, failure) = match upload_outcome {
+            ControlFlow::Continue(uploads) => {
+                info!("Remote objects created successfully");
+                (uploads, None)
+            }
+            ControlFlow::Break(uploads) => (
+                uploads,
+                Some(anyhow::anyhow!(
+                    "One or multiple blobs failed to upload to Azure"
+                )),
+            ),
+        };
+
+        let with_blobs = AzureWithTestBlobs {
+            enabled,
+            remote_prefixes: uploads.prefixes,
+            remote_blobs: uploads.blobs,
+        };
+        match failure {
+            None => Self::Enabled(with_blobs),
+            Some(e) => Self::UploadsFailed(e, with_blobs),
+        }
+    }
+
+    /// Deletes every blob that was uploaded during `setup`, also after a partial upload failure.
+    async fn teardown(self) {
+        let ctx = match self {
+            Self::Disabled => return,
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => ctx,
+        };
+
+        cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
+    }
+}
+
+// NOTE: the setups for the list_prefixes test and the list_files test are very similar.
+// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details
+/// Test context that, when enabled, also carries the `folder{j}/blob_{i}.txt` blobs uploaded
+/// during `setup`.
+enum MaybeEnabledAzureWithSimpleTestBlobs {
+    /// All uploads succeeded.
+    Enabled(AzureWithSimpleTestBlobs),
+    /// The enabling env variable is not set; nothing was created.
+    Disabled,
+    /// At least one upload failed. The context still lists everything that was uploaded
+    /// successfully, so `teardown` can remove it.
+    UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
+}
+/// An enabled Azure context together with the blobs created for the test.
+struct AzureWithSimpleTestBlobs {
+    /// Client and base prefix used by the test.
+    enabled: EnabledAzure,
+    /// Full paths of the successfully uploaded blobs; these are deleted on teardown.
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
+    /// Creates the client with a small list-response page size and uploads enough blobs
+    /// (`1 + 2 * page size`) that listing them needs more than two pages.
+    ///
+    /// Returns `Disabled` when the enabling env variable is not set, and `UploadsFailed` when
+    /// any of the uploads did not succeed.
+    async fn setup() -> Self {
+        ensure_logging_ready();
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
+        let upload_outcome = upload_simple_azure_data(&enabled.client, upload_tasks_count).await;
+
+        // Both outcomes carry the successfully uploaded blobs; only the failure adds an error.
+        let (remote_blobs, failure) = match upload_outcome {
+            ControlFlow::Continue(uploaded) => {
+                info!("Remote objects created successfully");
+                (uploaded, None)
+            }
+            ControlFlow::Break(uploaded) => (
+                uploaded,
+                Some(anyhow::anyhow!(
+                    "One or multiple blobs failed to upload to Azure"
+                )),
+            ),
+        };
+
+        let with_blobs = AzureWithSimpleTestBlobs {
+            enabled,
+            remote_blobs,
+        };
+        match failure {
+            None => Self::Enabled(with_blobs),
+            Some(e) => Self::UploadsFailed(e, with_blobs),
+        }
+    }
+
+    /// Deletes every blob that was uploaded during `setup`, also after a partial upload failure.
+    async fn teardown(self) {
+        let ctx = match self {
+            Self::Disabled => return,
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => ctx,
+        };
+
+        cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
+    }
+}
+
+/// Builds a [`GenericRemoteStorage`] backed by the Azure container named in the environment.
+///
+/// Reads `REMOTE_STORAGE_AZURE_CONTAINER` and `REMOTE_STORAGE_AZURE_REGION`, and places all
+/// objects under a per-call `test_{millis}_{random}/` prefix inside the container.
+///
+/// # Errors
+///
+/// Fails if either env variable is not set, if the system clock is before the UNIX epoch, or if
+/// the storage cannot be initialized from the resulting config.
+fn create_azure_client(
+    max_keys_per_list_response: Option<i32>,
+) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
+    let container_name = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context(
+        "`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled",
+    )?;
+    let container_region = env::var("REMOTE_STORAGE_AZURE_REGION").context(
+        "`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled",
+    )?;
+
+    // The timestamp alone is not unique: test runners have ended up with identical time-based
+    // prefixes before. It is kept only as a debugging aid for finding the prefix later.
+    let since_epoch = std::time::SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .context("random Azure test prefix part calculation")?;
+    let millis = since_epoch.as_millis();
+
+    // The random part is what actually keeps concurrently created prefixes apart.
+    let random: u32 = rand::thread_rng().gen();
+    let prefix_in_container = format!("test_{millis}_{random:08x}/");
+
+    let storage = RemoteStorageKind::AzureContainer(AzureConfig {
+        container_name,
+        container_region,
+        prefix_in_container: Some(prefix_in_container),
+        concurrency_limit: NonZeroUsize::new(100).unwrap(),
+        max_keys_per_list_response,
+    });
+    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
+        storage,
+    };
+
+    let storage_client =
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
+    Ok(Arc::new(storage_client))
+}
+
+/// The remote objects that `upload_azure_data` managed to create.
+struct Uploads {
+    /// The `sub_prefix_{i}/` prefix of every successfully uploaded blob.
+    prefixes: HashSet<RemotePath>,
+    /// The full path of every successfully uploaded blob.
+    blobs: HashSet<RemotePath>,
+}
+
+/// Concurrently uploads `upload_tasks_count` blobs, each at
+/// `{base_prefix_str}/sub_prefix_{i}/blob_{i}` for `i` in `1..=upload_tasks_count`.
+///
+/// Returns `ControlFlow::Continue` when every upload succeeded and `ControlFlow::Break` when at
+/// least one failed. Either way the payload lists the prefixes and blobs that were created, so
+/// the caller can clean them up.
+async fn upload_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..=upload_tasks_count {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_name = format!("blob_{i}");
+            let blob_path = blob_prefix.join(Utf8Path::new(&blob_name));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let payload = format!("remote blob data {i}").into_bytes();
+            let payload_len = payload.len();
+            let reader = std::io::Cursor::new(payload);
+            task_client
+                .upload(reader, payload_len, &blob_path, None)
+                .await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    let mut all_uploads_succeeded = true;
+    let mut prefixes = HashSet::with_capacity(upload_tasks_count);
+    let mut blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(joined) = upload_tasks.join_next().await {
+        // A task can fail either by not completing (join error) or by returning an upload error.
+        let task_outcome = match joined.context("task join failed") {
+            Ok(task_result) => task_result.context("upload task failed"),
+            Err(join_failure) => Err(join_failure),
+        };
+
+        match task_outcome {
+            Ok((upload_prefix, upload_path)) => {
+                prefixes.insert(upload_prefix);
+                blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                all_uploads_succeeded = false;
+            }
+        }
+    }
+
+    let uploads = Uploads { prefixes, blobs };
+    if all_uploads_succeeded {
+        ControlFlow::Continue(uploads)
+    } else {
+        ControlFlow::Break(uploads)
+    }
+}
+
+/// Deletes all `objects_to_delete` from the remote storage, one concurrent task per object.
+///
+/// This is best-effort: failures are logged and never propagated to the caller.
+async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+
+    let mut deletions = JoinSet::new();
+    for path in objects_to_delete {
+        let storage = Arc::clone(client);
+        deletions.spawn(async move {
+            debug!("Deleting remote item at path {path:?}");
+            let outcome = storage.delete(&path).await;
+            outcome.with_context(|| format!("{path:?} removal"))
+        });
+    }
+
+    while let Some(joined) = deletions.join_next().await {
+        match joined {
+            Ok(Ok(())) => {}
+            Ok(Err(e)) => error!("Delete task failed: {e:?}"),
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
+
+/// Concurrently uploads `upload_tasks_count` blobs named `folder{j}/blob_{i}.txt`, where `i`
+/// runs over `1..=upload_tasks_count` and `j` is `i / 7`. See test description for more details.
+///
+/// Returns `ControlFlow::Continue` when every upload succeeded and `ControlFlow::Break` when at
+/// least one failed. Either way the payload is the set of blobs that were created, so the caller
+/// can clean them up.
+async fn upload_simple_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    upload_tasks_count: usize,
+) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..=upload_tasks_count {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let local_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
+            let utf8_path =
+                Utf8Path::from_path(local_path.as_path()).expect("must be valid blob path");
+            let blob_path = RemotePath::new(utf8_path)
+                .with_context(|| format!("{local_path:?} to RemotePath conversion"))?;
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let payload = format!("remote blob data {i}").into_bytes();
+            let payload_len = payload.len();
+            let reader = std::io::Cursor::new(payload);
+            task_client
+                .upload(reader, payload_len, &blob_path, None)
+                .await?;
+
+            Ok::<_, anyhow::Error>(blob_path)
+        });
+    }
+
+    let mut all_uploads_succeeded = true;
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(joined) = upload_tasks.join_next().await {
+        // A task can fail either by not completing (join error) or by returning an upload error.
+        let task_outcome = match joined.context("task join failed") {
+            Ok(task_result) => task_result.context("upload task failed"),
+            Err(join_failure) => Err(join_failure),
+        };
+
+        match task_outcome {
+            Ok(upload_path) => {
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                all_uploads_succeeded = false;
+            }
+        }
+    }
+
+    if all_uploads_succeeded {
+        ControlFlow::Continue(uploaded_blobs)
+    } else {
+        ControlFlow::Break(uploaded_blobs)
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -73,6 +73,8 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+pub mod sync;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
@@ -128,6 +130,21 @@ macro_rules! project_git_version {
    };
 }

+/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
+///
+/// Defines `const $const_identifier: &str`, whose value is decided at compile time:
+/// - `build_tag-env:<value>` when the `BUILD_TAG` environment variable is set during the build
+/// - `build_tag:` (with an empty tag) otherwise
+#[macro_export]
+macro_rules! project_build_tag {
+    ($const_identifier:ident) => {
+        const $const_identifier: &::core::primitive::str = {
+            // prefix and tag are kept as two consts so that they can be joined by `concatcp!`
+            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
+                ::core::option::Option::Some(x) => ["build_tag-env:", x],
+                ::core::option::Option::None => ["build_tag:", ""],
+            };
+
+            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
+        };
+    };
+}
+
 /// Re-export for `project_git_version` macro
 #[doc(hidden)]
 pub use const_format as __const_format;
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -0,0 +1 @@
+pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -0,0 +1,350 @@
+use std::sync::{Arc, Mutex, MutexGuard};
+use tokio::sync::Semaphore;
+
+/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
+/// `SemaphorePermit`, allowing use of [`Guard::take_and_deinit`] which does not require holding an
+/// outer mutex guard for the duration of initialization.
+///
+/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
+///
+/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
+pub struct OnceCell<T> {
+    // the mutex is only held for short, non-async sections and while a `Guard` is alive
+    inner: Mutex<Inner<T>>,
+}
+
+impl<T> Default for OnceCell<T> {
+    /// Create new uninitialized [`OnceCell`].
+    fn default() -> Self {
+        let inner = Mutex::new(Inner::default());
+        Self { inner }
+    }
+}
+
+/// Semaphore is the current state:
+/// - open semaphore means the value is `None`, not yet initialized
+/// - closed semaphore means the value has been initialized
+///
+/// [`Guard::take_and_deinit`] replaces the whole `Inner`, so a deinitialized cell gets a new,
+/// open semaphore instead of reopening the closed one.
+#[derive(Debug)]
+struct Inner<T> {
+    /// Single-permit semaphore; whoever holds the permit is the one (de)initializing the value.
+    init_semaphore: Arc<Semaphore>,
+    /// `Some` once initialized.
+    value: Option<T>,
+}
+
+impl<T> Default for Inner<T> {
+    /// Uninitialized state: no value, and an open semaphore with its single permit available.
+    fn default() -> Self {
+        let init_semaphore = Arc::new(Semaphore::new(1));
+        Self {
+            init_semaphore,
+            value: None,
+        }
+    }
+}
+
+impl<T> OnceCell<T> {
+    /// Creates an already initialized `OnceCell` with the given value.
+    pub fn new(value: T) -> Self {
+        // a closed semaphore is the marker for "initialized"
+        let sem = Semaphore::new(1);
+        sem.close();
+        Self {
+            inner: Mutex::new(Inner {
+                init_semaphore: Arc::new(sem),
+                value: Some(value),
+            }),
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error of `factory`. The cell then stays uninitialized and a later call can
+    /// attempt the initialization again.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    where
+        F: FnOnce(InitPermit) -> Fut,
+        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
+    {
+        loop {
+            let sem = {
+                let guard = self.inner.lock().unwrap();
+                if guard.value.is_some() {
+                    return Ok(Guard(guard));
+                }
+                guard.init_semaphore.clone()
+            };
+
+            // the mutex is not held while waiting for the permit
+            match Arc::clone(&sem).acquire_owned().await {
+                Ok(permit) => {
+                    let permit = InitPermit(permit);
+                    let (value, _permit) = factory(permit).await?;
+
+                    let guard = self.inner.lock().unwrap();
+
+                    return Ok(Self::set0(value, guard));
+                }
+                Err(_closed) => {
+                    let guard = self.inner.lock().unwrap();
+                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
+                        // The semaphore we waited on was closed by a completed initialization,
+                        // but a `Guard::take_and_deinit` has replaced the inner state since. The
+                        // value may be gone again, so start over with the current semaphore
+                        // instead of asserting on a state we did not wait for.
+                        continue;
+                    }
+                    assert!(
+                        guard.value.is_some(),
+                        "semaphore got closed, must be initialized"
+                    );
+                    return Ok(Guard(guard));
+                }
+            }
+        }
+    }
+
+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
+    /// to complete initializing the inner value.
+    ///
+    /// # Panics
+    ///
+    /// If the inner has already been initialized, or if a permit of the cell's current semaphore
+    /// is still available, which means `_permit` was acquired from some other semaphore.
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        // cannot assert that this permit is for self.inner.semaphore
+        let guard = self.inner.lock().unwrap();
+
+        // what can be checked: the only permit of our semaphore must already be taken
+        if guard.init_semaphore.try_acquire().is_ok() {
+            // release the lock before panicking so that the mutex does not get poisoned
+            drop(guard);
+            panic!("semaphore is of wrong origin");
+        }
+
+        Self::set0(value, guard)
+    }
+
+    /// Stores `value` and closes the semaphore to mark the cell initialized.
+    ///
+    /// Callers must hold the initialization permit; panics if a value is already present.
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
+        if guard.value.is_some() {
+            // release the lock before panicking so that the mutex does not get poisoned
+            drop(guard);
+            unreachable!("we won permit, must not be initialized");
+        }
+        guard.value = Some(value);
+        guard.init_semaphore.close();
+        Guard(guard)
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
+        if guard.value.is_some() {
+            Some(Guard(guard))
+        } else {
+            None
+        }
+    }
+}
+
+/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
+/// initialized value.
+///
+/// Wraps the cell's [`MutexGuard`], so the cell stays locked for as long as the guard is alive.
+#[derive(Debug)]
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for Guard<'_, T> {
+    type Target = T;
+
+    /// Shared access to the initialized value.
+    fn deref(&self) -> &Self::Target {
+        let initialized: Option<&T> = self.0.value.as_ref();
+        initialized.expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> std::ops::DerefMut for Guard<'_, T> {
+    /// Exclusive access to the initialized value.
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        let initialized: Option<&mut T> = self.0.value.as_mut();
+        initialized.expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<'a, T> Guard<'a, T> {
+    /// Take the current value, and a new permit for its deinitialization.
+    ///
+    /// The permit will be on a semaphore part of the new internal value, and any following
+    /// [`OnceCell::get_or_init`] will wait on it to complete.
+    ///
+    /// After this call the guard refers to an uninitialized cell: dereferencing it or calling
+    /// this method again panics, so drop the guard right away.
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+        let fresh = Inner::default();
+        // taken before `fresh` becomes visible to anyone else, so it cannot be contended
+        let permit = Arc::clone(&fresh.init_semaphore)
+            .try_acquire_owned()
+            .expect("we just created this");
+
+        let previous = std::mem::replace(&mut *self.0, fresh);
+        let value = previous
+            .value
+            .expect("guard is not created unless value has been initialized");
+
+        (value, InitPermit(permit))
+    }
+}
+
+/// Type held by OnceCell (de)initializing task.
+///
+/// Handed to the `get_or_init` factory and returned by [`Guard::take_and_deinit`]; wraps the
+/// single permit of the cell's initialization semaphore.
+pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        convert::Infallible,
+        sync::atomic::{AtomicUsize, Ordering},
+        time::Duration,
+    };
+
+    /// Many tasks race to initialize the same cell; exactly one factory must run and exactly one
+    /// task must see its own value in the cell.
+    #[tokio::test]
+    async fn many_initializers() {
+        #[derive(Default, Debug)]
+        struct Counters {
+            factory_got_to_run: AtomicUsize,
+            future_polled: AtomicUsize,
+            winners: AtomicUsize,
+        }
+
+        let initializers = 100;
+
+        let cell = Arc::new(OnceCell::default());
+        let counters = Arc::new(Counters::default());
+        // one extra party for this test function, which releases all the tasks at once
+        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
+
+        let mut js = tokio::task::JoinSet::new();
+        for i in 0..initializers {
+            js.spawn({
+                let cell = cell.clone();
+                let counters = counters.clone();
+                let barrier = barrier.clone();
+
+                async move {
+                    barrier.wait().await;
+                    // each task tries to store its own index; it "won" if that is what got stored
+                    let won = {
+                        let g = cell
+                            .get_or_init(|permit| {
+                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
+                                async {
+                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
+                                    Ok::<_, Infallible>((i, permit))
+                                }
+                            })
+                            .await
+                            .unwrap();
+
+                        *g == i
+                    };
+
+                    if won {
+                        counters.winners.fetch_add(1, Ordering::Relaxed);
+                    }
+                }
+            });
+        }
+
+        barrier.wait().await;
+
+        while let Some(next) = js.join_next().await {
+            next.expect("no panics expected");
+        }
+
+        // all tasks have finished, so this is the last reference to the counters
+        let mut counters = Arc::try_unwrap(counters).unwrap();
+
+        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
+        assert_eq!(*counters.future_polled.get_mut(), 1);
+        assert_eq!(*counters.winners.get_mut(), 1);
+    }
+
+    /// A `get_or_init` issued while another task holds the permit from `take_and_deinit` must not
+    /// complete before that permit is dropped.
+    #[tokio::test(start_paused = true)]
+    async fn reinit_waits_for_deinit() {
+        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
+        let sleep_for = Duration::from_secs(1);
+        let initial = 42;
+        let reinit = 1;
+        let cell = Arc::new(OnceCell::new(initial));
+
+        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
+
+        let jh = tokio::spawn({
+            let cell = cell.clone();
+            let deinitialization_started = deinitialization_started.clone();
+            async move {
+                // `_permit` is kept until the end of the task, that is until after the sleep
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                assert_eq!(answer, initial);
+
+                deinitialization_started.wait().await;
+                tokio::time::sleep(sleep_for).await;
+            }
+        });
+
+        deinitialization_started.wait().await;
+
+        let started_at = tokio::time::Instant::now();
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+            .await
+            .unwrap();
+
+        let elapsed = started_at.elapsed();
+        assert!(
+            elapsed >= sleep_for,
+            "initialization should have taken at least the time slept with permit"
+        );
+
+        jh.await.unwrap();
+
+        assert_eq!(*cell.get().unwrap(), reinit);
+    }
+
+    /// The permit returned by `take_and_deinit` can be handed to `set` to initialize the cell
+    /// again, repeatedly.
+    #[test]
+    fn reinit_with_deinit_permit() {
+        let cell = Arc::new(OnceCell::new(42));
+
+        // `mol` is the initial value, 42
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);
+
+        // and back again: take the 5 out and restore the initial value
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
+        assert_eq!(5, five);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
+    }
+
+    /// A failing factory leaves the cell uninitialized, so initialization can be attempted again
+    /// until it succeeds.
+    #[tokio::test]
+    async fn initialization_attemptable_until_ok() {
+        let cell = OnceCell::default();
+
+        for _ in 0..10 {
+            // the permit is dropped together with the failed factory future
+            cell.get_or_init(|_permit| async { Err("whatever error") })
+                .await
+                .unwrap_err();
+        }
+
+        let g = cell
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .await
+            .unwrap();
+        assert_eq!(*g, "finally success");
+    }
+
+    /// Dropping a `get_or_init` future in the middle of its factory must leave the cell
+    /// uninitialized and ready for another initialization attempt.
+    #[tokio::test]
+    async fn initialization_is_cancellation_safe() {
+        let cell = OnceCell::default();
+
+        let barrier = tokio::sync::Barrier::new(2);
+
+        let initializer = cell.get_or_init(|permit| async {
+            // reaching the barrier proves that the factory holds the permit at this point
+            barrier.wait().await;
+            futures::future::pending::<()>().await;
+
+            Ok::<_, Infallible>(("never reached", permit))
+        });
+
+        // the second barrier party completes only once the factory is running; at that point the
+        // select is done and `initializer` gets dropped while stuck in `pending()`
+        tokio::select! {
+            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
+            _ = barrier.wait() => {}
+        };
+
+        // now initializer is dropped
+
+        assert!(cell.get().is_none());
+
+        let g = cell
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .await
+            .unwrap();
+        assert_eq!(*g, "now initialized");
+    }
+}
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -27,8 +27,8 @@ and old one if it exists.
 * the filecache: a struct that allows communication with the Postgres file cache.
 On startup, we connect to the filecache and hold on to the connection for the
 entire monitor lifetime.
-* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
-listening for `memory.high` events and setting its `memory.{high,max}` values.
+* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
+usage and sends rolling aggregates to the runner.
 * the runner: the runner marries the filecache and cgroup watcher together,
 communicating with the agent throught the `Dispatcher`, and then calling filecache
 and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -1,161 +1,38 @@
-use std::{
-    fmt::{Debug, Display},
-    fs,
-    pin::pin,
-    sync::atomic::{AtomicU64, Ordering},
-};
+use std::fmt::{self, Debug, Formatter};
+use std::time::{Duration, Instant};

-use anyhow::{anyhow, bail, Context};
+use anyhow::{anyhow, Context};
 use cgroups_rs::{
-    freezer::FreezerController,
-    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    hierarchies::{self, is_cgroup2_unified_mode},
    memory::MemController,
-    MaxValue,
-    Subsystem::{Freezer, Mem},
+    Subsystem,
 };
-use inotify::{EventStream, Inotify, WatchMask};
-use tokio::sync::mpsc::{self, error::TryRecvError};
-use tokio::time::{Duration, Instant};
-use tokio_stream::{Stream, StreamExt};
+use tokio::sync::watch;
 use tracing::{info, warn};

-use crate::protocol::Resources;
-use crate::MiB;
-
-/// Monotonically increasing counter of the number of memory.high events
-/// the cgroup has experienced.
-///
-/// We use this to determine if a modification to the `memory.events` file actually
-/// changed the `high` field. If not, we don't care about the change. When we
-/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
-/// to see if it changed since last time.
-pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
-
-/// Monotonically increasing counter that gives each cgroup event a unique id.
-///
-/// This allows us to answer questions like "did this upscale arrive before this
-/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
-/// with a sequence number. As such, prefer to used the `Sequenced` type rather
-/// than this static directly.
-static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
-
-/// A memory event type reported in memory.events.
-#[derive(Debug, Eq, PartialEq, Copy, Clone)]
-pub enum MemoryEvent {
-    Low,
-    High,
-    Max,
-    Oom,
-    OomKill,
-    OomGroupKill,
-}
-
-impl MemoryEvent {
-    fn as_str(&self) -> &str {
-        match self {
-            MemoryEvent::Low => "low",
-            MemoryEvent::High => "high",
-            MemoryEvent::Max => "max",
-            MemoryEvent::Oom => "oom",
-            MemoryEvent::OomKill => "oom_kill",
-            MemoryEvent::OomGroupKill => "oom_group_kill",
-        }
-    }
-}
-
-impl Display for MemoryEvent {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(self.as_str())
-    }
-}
-
 /// Configuration for a `CgroupWatcher`
 #[derive(Debug, Clone)]
 pub struct Config {
-    // The target difference between the total memory reserved for the cgroup
-    // and the value of the cgroup's memory.high.
-    //
-    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
-    // use (equal to system memory, minus whatever's taken out for the file cache).
-    oom_buffer_bytes: u64,
+    /// Interval at which we should be fetching memory statistics
+    memory_poll_interval: Duration,

-    // The amount of memory, in bytes, below a proposed new value for
-    // memory.high that the cgroup's memory usage must be for us to downscale
-    //
-    // In other words, we can downscale only when:
-    //
-    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
-    //
-    // TODO: there's some minor issues with this approach -- in particular, that we might have
-    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
-    pub(crate) memory_high_buffer_bytes: u64,
-
-    // The maximum duration, in milliseconds, that we're allowed to pause
-    // the cgroup for while waiting for the autoscaler-agent to upscale us
-    max_upscale_wait: Duration,
-
-    // The required minimum time, in milliseconds, that we must wait before re-freezing
-    // the cgroup while waiting for the autoscaler-agent to upscale us.
-    do_not_freeze_more_often_than: Duration,
-
-    // The amount of memory, in bytes, that we should periodically increase memory.high
-    // by while waiting for the autoscaler-agent to upscale us.
-    //
-    // This exists to avoid the excessive throttling that happens when a cgroup is above its
-    // memory.high for too long. See more here:
-    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
-    memory_high_increase_by_bytes: u64,
-
-    // The period, in milliseconds, at which we should repeatedly increase the value
-    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
-    // is still being hit.
-    //
-    // Technically speaking, this actually serves as a rate limit to moderate responding to
-    // memory.high events, but these are roughly equivalent if the process is still allocating
-    // memory.
-    memory_high_increase_every: Duration,
-}
-
-impl Config {
-    /// Calculate the new value for the cgroups memory.high based on system memory
-    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
-        total_system_mem.saturating_sub(self.oom_buffer_bytes)
-    }
+    /// The number of samples used in constructing aggregated memory statistics
+    memory_history_len: usize,
+    /// The number of most recent samples that will be periodically logged.
+    ///
+    /// Each sample is logged exactly once. Increasing this value means that recent samples will be
+    /// logged less frequently, and vice versa.
+    ///
+    /// For simplicity, this value must be greater than or equal to `memory_history_len`.
+    memory_history_log_interval: usize,
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
-            oom_buffer_bytes: 100 * MiB,
-            memory_high_buffer_bytes: 100 * MiB,
-            // while waiting for upscale, don't freeze for more than 20ms every 1s
-            max_upscale_wait: Duration::from_millis(20),
-            do_not_freeze_more_often_than: Duration::from_millis(1000),
-            // while waiting for upscale, increase memory.high by 10MiB every 25ms
-            memory_high_increase_by_bytes: 10 * MiB,
-            memory_high_increase_every: Duration::from_millis(25),
-        }
-    }
-}
-
-/// Used to represent data that is associated with a certain point in time, such
-/// as an upscale request or memory.high event.
-///
-/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
-/// a unique sequence number. Sequence numbers are monotonically increasing,
-/// allowing us to answer questions like "did this upscale happen after this
-/// memory.high event?" by comparing the sequence numbers of the two events.
-#[derive(Debug, Clone)]
-pub struct Sequenced<T> {
-    seqnum: u64,
-    data: T,
-}
-
-impl<T> Sequenced<T> {
-    pub fn new(data: T) -> Self {
-        Self {
-            seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
-            data,
+            memory_poll_interval: Duration::from_millis(100),
+            memory_history_len: 5, // use 500ms of history for decision-making
+            memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
        }
    }
 }
@@ -170,74 +47,14 @@ impl<T> Sequenced<T> {
 pub struct CgroupWatcher {
    pub config: Config,

-    /// The sequence number of the last upscale.
-    ///
-    /// If we receive a memory.high event that has a _lower_ sequence number than
-    /// `last_upscale_seqnum`, then we know it occured before the upscale, and we
-    /// can safely ignore it.
-    ///
-    /// Note: Like the `events` field, this doesn't _need_ interior mutability but we
-    /// use it anyways so that methods take `&self`, not `&mut self`.
-    last_upscale_seqnum: AtomicU64,
-
-    /// A channel on which we send messages to request upscale from the dispatcher.
-    upscale_requester: mpsc::Sender<()>,
-
    /// The actual cgroup we are watching and managing.
    cgroup: cgroups_rs::Cgroup,
 }

-/// Read memory.events for the desired event type.
-///
-/// `path` specifies the path to the desired `memory.events` file.
-/// For more info, see the `memory.events` section of the [kernel docs]
-/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
-fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
-    let contents = fs::read_to_string(path)
-        .with_context(|| format!("failed to read memory.events from {path}"))?;
-
-    // Then contents of the file look like:
-    // low 42
-    // high 101
-    // ...
-    contents
-        .lines()
-        .filter_map(|s| s.split_once(' '))
-        .find(|(e, _)| *e == event.as_str())
-        .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
-        .and_then(|(_, count)| {
-            count
-                .parse::<u64>()
-                .with_context(|| format!("failed to parse memory.{event} as u64"))
-        })
-}
-
-/// Create an event stream that produces events whenever the file at the provided
-/// path is modified.
-fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
-    info!("creating file watcher for {path}");
-    let inotify = Inotify::init().context("failed to initialize file watcher")?;
-    inotify
-        .watches()
-        .add(path, WatchMask::MODIFY)
-        .with_context(|| format!("failed to start watching {path}"))?;
-    inotify
-        // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
-        // to store one event at a time - if the event gets written over, that's
-        // ok. We still see that there is an event. For more information, see:
-        // https://man7.org/linux/man-pages/man7/inotify.7.html
-        .into_event_stream([0u8; 1024])
-        .context("failed to start inotify event stream")
-}
-
 impl CgroupWatcher {
    /// Create a new `CgroupWatcher`.
    #[tracing::instrument(skip_all, fields(%name))]
-    pub fn new(
-        name: String,
-        // A channel on which to send upscale requests
-        upscale_requester: mpsc::Sender<()>,
-    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
+    pub fn new(name: String) -> anyhow::Result<Self> {
        // TODO: clarify exactly why we need v2
        // Make sure cgroups v2 (aka unified) are supported
        if !is_cgroup2_unified_mode() {
@@ -245,410 +62,203 @@ impl CgroupWatcher {
        }
        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);

-        // Start monitoring the cgroup for memory events. In general, for
-        // cgroups v2 (aka unified), metrics are reported in files like
-        // > `/sys/fs/cgroup/{name}/{metric}`
-        // We are looking for `memory.high` events, which are stored in the
-        // file `memory.events`. For more info, see the `memory.events` section
-        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
-        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
-        let memory_events = create_file_watcher(&path)
-            .with_context(|| format!("failed to create event watcher for {path}"))?
-            // This would be nice with with .inspect_err followed by .ok
-            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
-                Ok(high) => Some(high),
-                Err(error) => {
-                    // TODO: Might want to just panic here
-                    warn!(?error, "failed to read high events count from {}", &path);
-                    None
-                }
-            })
-            // Only report the event if the memory.high count increased
-            .filter_map(|high| {
-                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
-                    Some(high)
-                } else {
-                    None
-                }
-            })
-            .map(Sequenced::new);
-
-        let initial_count = get_event_count(
-            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
-            MemoryEvent::High,
-        )?;
-
-        info!(initial_count, "initial memory.high event count");
-
-        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
-        // running in the cgroup before that caused it to be non-zero.
-        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
-
-        Ok((
-            Self {
-                cgroup,
-                upscale_requester,
-                last_upscale_seqnum: AtomicU64::new(0),
-                config: Default::default(),
-            },
-            memory_events,
-        ))
+        Ok(Self {
+            cgroup,
+            config: Default::default(),
+        })
    }

    /// The entrypoint for the `CgroupWatcher`.
    #[tracing::instrument(skip_all)]
-    pub async fn watch<E>(
+    pub async fn watch(
        &self,
-        // These are ~dependency injected~ (fancy, I know) because this function
-        // should never return.
-        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
-        // -> therefore: if we want to stick it in an Arc so many threads can access
-        //    it, methods can never take mutable access.
-        //     - note: we use the Arc strategy so that a) we can call this function
-        //             right here and b) the runner can call the set/get_memory methods
-        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
-        //    we just pass them in here instead of holding them in fields, as that
-        //    would require this method to take &mut self.
-        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
-        events: E,
-    ) -> anyhow::Result<()>
-    where
-        E: Stream<Item = Sequenced<u64>>,
-    {
-        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
-        let mut last_memory_high_increase_at: Option<Instant> = None;
-        let mut events = pin!(events);
-
-        // Are we waiting to be upscaled? Could be true if we request upscale due
-        // to a memory.high event and it does not arrive in time.
-        let mut waiting_on_upscale = false;
-
-        loop {
-            tokio::select! {
-                upscale = upscales.recv() => {
-                    let Sequenced { seqnum, data } = upscale
-                        .context("failed to listen on upscale notification channel")?;
-                    waiting_on_upscale = false;
-                    last_memory_high_increase_at = None;
-                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
-                }
-                event = events.next() => {
-                    let Some(Sequenced { seqnum, .. }) = event else {
-                        bail!("failed to listen for memory.high events")
-                    };
-                    // The memory.high came before our last upscale, so we consider
-                    // it resolved
-                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
-                        info!(
-                            "received memory.high event, but it came before our last upscale -> ignoring it"
-                        );
-                        continue;
-                    }
-
-                    // The memory.high came after our latest upscale. We don't
-                    // want to do anything yet, so peek the next event in hopes
-                    // that it's an upscale.
-                    if let Some(upscale_num) = self
-                        .upscaled(&mut upscales)
-                        .context("failed to check if we were upscaled")?
-                    {
-                        if upscale_num > seqnum {
-                            info!(
-                                "received memory.high event, but it came before our last upscale -> ignoring it"
-                            );
-                            continue;
-                        }
-                    }
-
-                    // If it's been long enough since we last froze, freeze the
-                    // cgroup and request upscale
-                    if wait_to_freeze.is_elapsed() {
-                        info!("received memory.high event -> requesting upscale");
-                        waiting_on_upscale = self
-                            .handle_memory_high_event(&mut upscales)
-                            .await
-                            .context("failed to handle upscale")?;
-                        wait_to_freeze
-                            .as_mut()
-                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
-                        continue;
-                    }
-
-                    // Ok, we can't freeze, just request upscale
-                    if !waiting_on_upscale {
-                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");
-
-                        // Make check to make sure we haven't been upscaled in the
-                        // meantine (can happen if the agent independently decides
-                        // to upscale us again)
-                        if self
-                            .upscaled(&mut upscales)
-                            .context("failed to check if we were upscaled")?
-                            .is_some()
-                        {
-                            info!("no need to request upscaling because we got upscaled");
-                            continue;
-                        }
-                        self.upscale_requester
-                            .send(())
-                            .await
-                            .context("failed to request upscale")?;
-                        waiting_on_upscale = true;
-                        continue;
-                    }
-
-                    // Shoot, we can't freeze or and we're still waiting on upscale,
-                    // increase memory.high to reduce throttling
-                    let can_increase_memory_high = match last_memory_high_increase_at {
-                        None => true,
-                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
-                    };
-                    if can_increase_memory_high {
-                        info!(
-                            "received memory.high event, \
-                            but too soon to refreeze and already requested upscale \
-                            -> increasing memory.high"
-                        );
-
-                        // Make check to make sure we haven't been upscaled in the
-                        // meantine (can happen if the agent independently decides
-                        // to upscale us again)
-                        if self
-                            .upscaled(&mut upscales)
-                            .context("failed to check if we were upscaled")?
-                            .is_some()
-                        {
-                            info!("no need to increase memory.high because got upscaled");
-                            continue;
-                        }
-
-                        // Request upscale anyways (the agent will handle deduplicating
-                        // requests)
-                        self.upscale_requester
-                            .send(())
-                            .await
-                            .context("failed to request upscale")?;
-
-                        let memory_high =
-                            self.get_memory_high_bytes().context("failed to get memory.high")?;
-                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
-                        info!(
-                            current_high_bytes = memory_high,
-                            new_high_bytes = new_high,
-                            "updating memory.high"
-                        );
-                        self.set_memory_high_bytes(new_high)
-                            .context("failed to set memory.high")?;
-                        last_memory_high_increase_at = Some(Instant::now());
-                        continue;
-                    }
-
-                    info!("received memory.high event, but can't do anything");
-                }
-            };
-        }
-    }
-
-    /// Handle a `memory.high`, returning whether we are still waiting on upscale
-    /// by the time the function returns.
-    ///
-    /// The general plan for handling a `memory.high` event is as follows:
-    /// 1. Freeze the cgroup
-    /// 2. Start a timer for `self.config.max_upscale_wait`
-    /// 3. Request upscale
-    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
-    /// 5. Return whether or not we are still waiting for upscale. If we are,
-    ///    we'll increase the cgroups memory.high to avoid getting oom killed
-    #[tracing::instrument(skip_all)]
-    async fn handle_memory_high_event(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
-    ) -> anyhow::Result<bool> {
-        // Immediately freeze the cgroup before doing anything else.
-        info!("received memory.high event -> freezing cgroup");
-        self.freeze().context("failed to freeze cgroup")?;
-
-        // We'll use this for logging durations
-        let start_time = Instant::now();
-
-        // Await the upscale until we have to unfreeze
-        let timed =
-            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
-
-        // Request the upscale
-        info!(
-            wait = ?self.config.max_upscale_wait,
-            "sending request for immediate upscaling",
-        );
-        self.upscale_requester
-            .send(())
-            .await
-            .context("failed to request upscale")?;
-
-        let waiting_on_upscale = match timed.await {
-            Ok(Ok(())) => {
-                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
-                false
-            }
-            // **important**: unfreeze the cgroup before ?-reporting the error
-            Ok(Err(e)) => {
-                info!("error waiting for upscale -> thawing cgroup");
-                self.thaw()
-                    .context("failed to thaw cgroup after errored waiting for upscale")?;
-                Err(e.context("failed to await upscale"))?
-            }
-            Err(_) => {
-                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
-                true
-            }
-        };
-
-        info!("thawing cgroup");
-        self.thaw().context("failed to thaw cgroup")?;
-
-        Ok(waiting_on_upscale)
-    }
-
-    /// Checks whether we were just upscaled, returning the upscale's sequence
-    /// number if so.
-    #[tracing::instrument(skip_all)]
-    fn upscaled(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
-    ) -> anyhow::Result<Option<u64>> {
-        let Sequenced { seqnum, data } = match upscales.try_recv() {
-            Ok(upscale) => upscale,
-            Err(TryRecvError::Empty) => return Ok(None),
-            Err(TryRecvError::Disconnected) => {
-                bail!("upscale notification channel was disconnected")
-            }
-        };
-
-        // Make sure to update the last upscale sequence number
-        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-        info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
-        Ok(Some(seqnum))
-    }
-
-    /// Await an upscale event, discarding any `memory.high` events received in
-    /// the process.
-    ///
-    /// This is used in `handle_memory_high_event`, where we need to listen
-    /// for upscales in particular so we know if we can thaw the cgroup early.
-    #[tracing::instrument(skip_all)]
-    async fn await_upscale(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+        updates: watch::Sender<(Instant, MemoryHistory)>,
    ) -> anyhow::Result<()> {
-        let Sequenced { seqnum, .. } = upscales
-            .recv()
-            .await
-            .context("error listening for upscales")?;
+        // this requirement makes the code a bit easier to work with; see the config for more.
+        assert!(self.config.memory_history_len <= self.config.memory_history_log_interval);

-        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-        Ok(())
-    }
+        let mut ticker = tokio::time::interval(self.config.memory_poll_interval);
+        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+        // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0

-    /// Get the cgroup's name.
-    pub fn path(&self) -> &str {
-        self.cgroup.path()
-    }
-}
+        let mem_controller = self.memory()?;

-// Methods for manipulating the actual cgroup
-impl CgroupWatcher {
-    /// Get a handle on the freezer subsystem.
-    fn freezer(&self) -> anyhow::Result<&FreezerController> {
-        if let Some(Freezer(freezer)) = self
-            .cgroup
-            .subsystems()
-            .iter()
-            .find(|sub| matches!(sub, Freezer(_)))
-        {
-            Ok(freezer)
-        } else {
-            anyhow::bail!("could not find freezer subsystem")
+        // buffer for samples that will be logged. once full, it remains so.
+        let history_log_len = self.config.memory_history_log_interval;
+        let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
+
+        for t in 0_u64.. {
+            ticker.tick().await;
+
+            let now = Instant::now();
+            let mem = Self::memory_usage(mem_controller);
+
+            let i = t as usize % history_log_len;
+            history_log_buf[i] = mem;
+
+            // We're taking *at most* memory_history_len values; we may be bounded by the total
+            // number of samples that have come in so far.
+            let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize;
+            // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact
+            // that we just inserted a value there, so the end of the iterator will *include* the
+            // value at i, rather than stopping just short of it.
+            let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count);
+
+            let summary = MemoryHistory {
+                avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::<u64>()
+                    / samples_count as u64,
+                samples_count,
+                samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32,
+            };
+
+            // Log the current history if it's time to do so. Because `history_log_buf` has length
+            // equal to the logging interval, we can just log the entire buffer every time we set
+            // the last entry, which also means that for this log line, we can ignore that it's a
+            // ring buffer (because all the entries are in order of increasing time).
+            if i == history_log_len - 1 {
+                info!(
+                    history = ?MemoryStatus::debug_slice(&history_log_buf),
+                    summary = ?summary,
+                    "Recent cgroup memory statistics history"
+                );
+            }
+
+            updates
+                .send((now, summary))
+                .context("failed to send MemoryHistory")?;
        }
-    }

-    /// Attempt to freeze the cgroup.
-    pub fn freeze(&self) -> anyhow::Result<()> {
-        self.freezer()
-            .context("failed to get freezer subsystem")?
-            .freeze()
-            .context("failed to freeze")
-    }
-
-    /// Attempt to thaw the cgroup.
-    pub fn thaw(&self) -> anyhow::Result<()> {
-        self.freezer()
-            .context("failed to get freezer subsystem")?
-            .thaw()
-            .context("failed to thaw")
+        unreachable!()
    }

    /// Get a handle on the memory subsystem.
-    ///
-    /// Note: this method does not require `self.memory_update_lock` because
-    /// getting a handle to the subsystem does not access any of the files we
-    /// care about, such as memory.high and memory.events
    fn memory(&self) -> anyhow::Result<&MemController> {
-        if let Some(Mem(memory)) = self
-            .cgroup
+        self.cgroup
            .subsystems()
            .iter()
-            .find(|sub| matches!(sub, Mem(_)))
-        {
-            Ok(memory)
-        } else {
-            anyhow::bail!("could not find memory subsystem")
-        }
-    }
-
-    /// Get cgroup current memory usage.
-    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
-        Ok(self
-            .memory()
-            .context("failed to get memory subsystem")?
-            .memory_stat()
-            .usage_in_bytes)
-    }
-
-    /// Set cgroup memory.high threshold.
-    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
-        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
-    }
-
-    /// Set the cgroup's memory.high to 'max', disabling it.
-    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
-        self.set_memory_high_internal(MaxValue::Max)
-    }
-
-    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
-        self.memory()
-            .context("failed to get memory subsystem")?
-            .set_mem(cgroups_rs::memory::SetMemory {
-                low: None,
-                high: Some(value),
-                min: None,
-                max: None,
+            .find_map(|sub| match sub {
+                Subsystem::Mem(c) => Some(c),
+                _ => None,
            })
-            .map_err(anyhow::Error::from)
+            .ok_or_else(|| anyhow!("could not find memory subsystem"))
    }

-    /// Get memory.high threshold.
-    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
-        let high = self
-            .memory()
-            .context("failed to get memory subsystem while getting memory statistics")?
-            .get_mem()
-            .map(|mem| mem.high)
-            .context("failed to get memory statistics from subsystem")?;
-        match high {
-            Some(MaxValue::Max) => Ok(i64::MAX as u64),
-            Some(MaxValue::Value(high)) => Ok(high as u64),
-            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
+    /// Given a handle on the memory subsystem, returns the current memory information
+    fn memory_usage(mem_controller: &MemController) -> MemoryStatus {
+        let stat = mem_controller.memory_stat().stat;
+        MemoryStatus {
+            non_reclaimable: stat.active_anon + stat.inactive_anon,
        }
    }
 }
+
+// Helper function for `CgroupWatcher::watch`
+fn ring_buf_recent_values_iter<T>(
+    buf: &[T],
+    last_value_idx: usize,
+    count: usize,
+) -> impl '_ + Iterator<Item = &T> {
+    // Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function
+    // easier (we only have to add `buf.len()` once, rather than a dynamic number of times).
+    assert!(count <= buf.len());
+
+    buf.iter()
+        // 'cycle' because the values could wrap around
+        .cycle()
+        // with 'cycle', this skip is more like 'offset', and functionally this is
+        // offsetting by 'last_value_idx - count (mod buf.len())', but we have to be
+        // careful to avoid underflow, so we pre-add buf.len().
+        // The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive.
+        .skip((buf.len() + last_value_idx + 1 - count) % buf.len())
+        .take(count)
+}
+
+/// Summary of recent memory usage
+#[derive(Debug, Copy, Clone)]
+pub struct MemoryHistory {
+    /// Rolling average of non-reclaimable memory usage over the last `samples_count` samples
+    pub avg_non_reclaimable: u64,
+
+    /// The number of samples used to construct this summary
+    pub samples_count: usize,
+    /// Total timespan between the first and last sample used for this summary
+    pub samples_span: Duration,
+}
+
+#[derive(Debug, Copy, Clone)]
+pub struct MemoryStatus {
+    non_reclaimable: u64,
+}
+
+impl MemoryStatus {
+    fn zeroed() -> Self {
+        MemoryStatus { non_reclaimable: 0 }
+    }
+
+    fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
+        struct DS<'a>(&'a [MemoryStatus]);
+
+        impl<'a> Debug for DS<'a> {
+            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+                f.debug_struct("[MemoryStatus]")
+                    .field(
+                        "non_reclaimable[..]",
+                        &Fields(self.0, |stat: &MemoryStatus| {
+                            BytesToGB(stat.non_reclaimable)
+                        }),
+                    )
+                    .finish()
+            }
+        }
+
+        struct Fields<'a, F>(&'a [MemoryStatus], F);
+
+        impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
+            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+                f.debug_list().entries(self.0.iter().map(&self.1)).finish()
+            }
+        }
+
+        struct BytesToGB(u64);
+
+        impl Debug for BytesToGB {
+            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+                f.write_fmt(format_args!(
+                    "{:.3}Gi",
+                    self.0 as f64 / (1_u64 << 30) as f64
+                ))
+            }
+        }
+
+        DS(slice)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn ring_buf_iter() {
+        let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+
+        let values = |offset, count| {
+            super::ring_buf_recent_values_iter(&buf, offset, count)
+                .copied()
+                .collect::<Vec<i32>>()
+        };
+
+        // Boundary conditions: start, end, and entire thing:
+        assert_eq!(values(0, 1), [0]);
+        assert_eq!(values(3, 4), [0, 1, 2, 3]);
+        assert_eq!(values(9, 4), [6, 7, 8, 9]);
+        assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
+
+        // "normal" operation: no wraparound
+        assert_eq!(values(7, 4), [4, 5, 6, 7]);
+
+        // wraparound:
+        assert_eq!(values(0, 4), [7, 8, 9, 0]);
+        assert_eq!(values(1, 4), [8, 9, 0, 1]);
+        assert_eq!(values(2, 4), [9, 0, 1, 2]);
+        assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
+    }
+}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -12,12 +12,10 @@ use futures::{
    stream::{SplitSink, SplitStream},
    SinkExt, StreamExt,
 };
-use tokio::sync::mpsc;
 use tracing::info;

-use crate::cgroup::Sequenced;
 use crate::protocol::{
-    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
    PROTOCOL_MIN_VERSION,
 };

@@ -36,13 +34,6 @@ pub struct Dispatcher {
    /// We send messages to the agent through `sink`
    sink: SplitSink<WebSocket, Message>,

-    /// Used to notify the cgroup when we are upscaled.
-    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
-
-    /// When the cgroup requests upscale it will send on this channel. In response
-    /// we send an `UpscaleRequst` to the agent.
-    pub(crate) request_upscale_events: mpsc::Receiver<()>,
-
    /// The protocol version we have agreed to use with the agent. This is negotiated
    /// during the creation of the dispatcher, and should be the highest shared protocol
    /// version.
@@ -61,11 +52,7 @@ impl Dispatcher {
    /// 1. Wait for the agent to sent the range of protocols it supports.
    /// 2. Send a protocol version that works for us as well, or an error if there
    ///    is no compatible version.
-    pub async fn new(
-        stream: WebSocket,
-        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
-        request_upscale_events: mpsc::Receiver<()>,
-    ) -> anyhow::Result<Self> {
+    pub async fn new(stream: WebSocket) -> anyhow::Result<Self> {
        let (mut sink, mut source) = stream.split();

        // Figure out the highest protocol version we both support
@@ -119,22 +106,10 @@ impl Dispatcher {
        Ok(Self {
            sink,
            source,
-            notify_upscale_events,
-            request_upscale_events,
            proto_version: highest_shared_version,
        })
    }

-    /// Notify the cgroup manager that we have received upscale and wait for
-    /// the acknowledgement.
-    #[tracing::instrument(skip_all, fields(?resources))]
-    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
-        self.notify_upscale_events
-            .send(resources)
-            .await
-            .context("failed to send resources and oneshot sender across channel")
-    }
-
    /// Send a message to the agent.
    ///
    /// Although this function is small, it has one major benefit: it is the only
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -5,18 +5,16 @@
 //! all functionality.

 use std::fmt::Debug;
-use std::sync::Arc;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Context};
 use axum::extract::ws::{Message, WebSocket};
 use futures::StreamExt;
-use tokio::sync::broadcast;
-use tokio::sync::mpsc;
+use tokio::sync::{broadcast, watch};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};

-use crate::cgroup::{CgroupWatcher, Sequenced};
+use crate::cgroup::{self, CgroupWatcher};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -28,7 +26,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args
 pub struct Runner {
    config: Config,
    filecache: Option<FileCacheState>,
-    cgroup: Option<Arc<CgroupWatcher>>,
+    cgroup: Option<CgroupState>,
    dispatcher: Dispatcher,

    /// We "mint" new message ids by incrementing this counter and taking the value.
@@ -45,6 +43,14 @@ pub struct Runner {
    kill: broadcast::Receiver<()>,
 }

+#[derive(Debug)]
+struct CgroupState {
+    watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>,
+    /// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale
+    /// requests.
+    threshold: u64,
+}
+
 /// Configuration for a `Runner`
 #[derive(Debug)]
 pub struct Config {
@@ -62,16 +68,56 @@ pub struct Config {
    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,
+
+    /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
+    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
+    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
+    /// threshold.
+    ///
+    /// For example, a value of `0.1` means that 10% of total memory must remain after exceeding
+    /// the threshold, so the value of the cgroup threshold would always be capped at 90% of total
+    /// memory.
+    ///
+    /// The default value of `0.15` means that we *guarantee* sending upscale requests if the
+    /// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
+    /// memory for the file cache).
+    cgroup_min_overhead_fraction: f64,
+
+    cgroup_downscale_threshold_buffer_bytes: u64,
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
            sys_buffer_bytes: 100 * MiB,
+            cgroup_min_overhead_fraction: 0.15,
+            cgroup_downscale_threshold_buffer_bytes: 100 * MiB,
        }
    }
 }

+impl Config {
+    fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
+        // If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
+        // and thus be non-reclaimable, so we should allow for additional memory usage.
+        //
+        // If the file cache sits on disk, our desired stable system state is for it to be fully
+        // page cached (its contents should only be paged to/from disk in situations where we can't
+        // upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
+        // threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
+        // out the file cache.
+        let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
+
+        // Even if we're not separately making room for the file cache (if it's in tmpfs), we still
+        // want our threshold to be met gracefully instead of letting postgres get OOM-killed.
+        // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
+        // remaining above the threshold.
+        let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
+
+        memory_remaining_for_cgroup.min(max_threshold)
+    }
+}
+
 impl Runner {
    /// Create a new monitor.
    #[tracing::instrument(skip_all, fields(?config, ?args))]
@@ -87,12 +133,7 @@ impl Runner {
            "invalid monitor Config: sys_buffer_bytes cannot be 0"
        );

-        // *NOTE*: the dispatcher and cgroup manager talk through these channels
-        // so make sure they each get the correct half, nothing is droppped, etc.
-        let (notified_send, notified_recv) = mpsc::channel(1);
-        let (requesting_send, requesting_recv) = mpsc::channel(1);
-
-        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
+        let dispatcher = Dispatcher::new(ws)
            .await
            .context("error creating new dispatcher")?;

@@ -106,46 +147,10 @@ impl Runner {
            kill,
        };

-        // If we have both the cgroup and file cache integrations enabled, it's possible for
-        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
-        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
-        // we *do* still want to determine the file cache size before setting the cgroup's
-        // memory.high, so it's not as simple as just swapping the order.
-        //
-        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
-        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
-        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
-        // of a hacky solution, but helps with reliability.
-        if let Some(name) = &args.cgroup {
-            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
-            // now, and then set limits later.
-            info!("initializing cgroup");
-
-            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
-                .context("failed to create cgroup manager")?;
-
-            info!("temporarily unsetting memory.high");
-
-            // Temporarily un-set cgroup memory.high; see above.
-            cgroup
-                .unset_memory_high()
-                .context("failed to unset memory.high")?;
-
-            let cgroup = Arc::new(cgroup);
-
-            let cgroup_clone = Arc::clone(&cgroup);
-            spawn_with_cancel(
-                token.clone(),
-                |_| error!("cgroup watcher terminated"),
-                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
-            );
-
-            state.cgroup = Some(cgroup);
-        }
-
-        let mut file_cache_reserved_bytes = 0;
        let mem = get_total_system_memory();

+        let mut file_cache_disk_size = 0;
+
        // We need to process file cache initialization before cgroup initialization, so that the memory
        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
        // memory limits.
@@ -156,7 +161,7 @@ impl Runner {
                false => FileCacheConfig::default_in_memory(),
            };

-            let mut file_cache = FileCacheState::new(connstr, config, token)
+            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
                .context("failed to create file cache")?;

@@ -181,23 +186,40 @@ impl Runner {
            if actual_size != new_size {
                info!("file cache size actually got set to {actual_size}")
            }
-            // Mark the resources given to the file cache as reserved, but only if it's in memory.
-            if !args.file_cache_on_disk {
-                file_cache_reserved_bytes = actual_size;
+
+            if args.file_cache_on_disk {
+                file_cache_disk_size = actual_size;
            }

            state.filecache = Some(file_cache);
        }

-        if let Some(cgroup) = &state.cgroup {
-            let available = mem - file_cache_reserved_bytes;
-            let value = cgroup.config.calculate_memory_high_value(available);
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");

-            info!(value, "setting memory.high");
+            let cgroup =
+                CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;

-            cgroup
-                .set_memory_high_bytes(value)
-                .context("failed to set cgroup memory.high")?;
+            let init_value = cgroup::MemoryHistory {
+                avg_non_reclaimable: 0,
+                samples_count: 0,
+                samples_span: Duration::ZERO,
+            };
+            let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));
+
+            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
+                cgroup.watch(hist_tx).await
+            });
+
+            let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
+            info!(threshold, "set initial cgroup threshold",);
+
+            state.cgroup = Some(CgroupState {
+                watcher: hist_rx,
+                threshold,
+            });
        }

        Ok(state)
@@ -217,28 +239,51 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let expected_file_cache_mem_usage = self
+        let (expected_file_cache_size, expected_file_cache_disk_size) = self
            .filecache
            .as_ref()
-            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
-            .unwrap_or(0);
-        let mut new_cgroup_mem_high = 0;
+            .map(|file_cache| {
+                let size = file_cache.config.calculate_cache_size(usable_system_memory);
+                match file_cache.config.in_memory {
+                    true => (size, 0),
+                    false => (size, size),
+                }
+            })
+            .unwrap_or((0, 0));
        if let Some(cgroup) = &self.cgroup {
-            new_cgroup_mem_high = cgroup
+            let (last_time, last_history) = *cgroup.watcher.borrow();
+
+            // NB: The ordering of these conditions is intentional. During startup, we should deny
+            // downscaling until we have enough information to determine that it's safe to do so
+            // (i.e. enough samples have come in). But if it's been a while and we *still* haven't
+            // received any information, we should *fail* instead of just denying downscaling.
+            //
+            // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
+            // serves double-duty: it trips if we haven't received *any* metrics for long enough,
+            // OR if we haven't received metrics *recently enough*.
+            //
+            // TODO: make the duration here configurable.
+            if last_time.elapsed() > Duration::from_secs(5) {
+                bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
+            } else if last_history.samples_count <= 1 {
+                let status = "haven't received enough cgroup memory stats yet";
+                info!(status, "discontinuing downscale");
+                return Ok((false, status.to_owned()));
+            }
+
+            let new_threshold = self
                .config
-                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);

-            let current = cgroup
-                .current_memory_usage()
-                .context("failed to fetch cgroup memory")?;
+            let current = last_history.avg_non_reclaimable;

-            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
+            if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes {
                let status = format!(
-                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
-                    "calculated memory.high too low",
-                    bytes_to_mebibytes(new_cgroup_mem_high),
+                    "{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
+                    "calculated memory threshold too low",
+                    bytes_to_mebibytes(new_threshold),
                    bytes_to_mebibytes(current),
-                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
+                    bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes)
                );

                info!(status, "discontinuing downscale");
@@ -249,14 +294,14 @@ impl Runner {

        // The downscaling has been approved. Downscale the file cache, then the cgroup.
        let mut status = vec![];
-        let mut file_cache_mem_usage = 0;
+        let mut file_cache_disk_size = 0;
        if let Some(file_cache) = &mut self.filecache {
            let actual_usage = file_cache
-                .set_file_cache_size(expected_file_cache_mem_usage)
+                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
            }
            let message = format!(
                "set file cache size to {} MiB (in memory = {})",
@@ -267,24 +312,18 @@ impl Runner {
            status.push(message);
        }

-        if let Some(cgroup) = &self.cgroup {
-            let available_memory = usable_system_memory - file_cache_mem_usage;
-
-            if file_cache_mem_usage != expected_file_cache_mem_usage {
-                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
-            }
-
-            // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
-            // since it is properly initialized in the previous cgroup if let block
-            cgroup
-                .set_memory_high_bytes(new_cgroup_mem_high)
-                .context("failed to set cgroup memory.high")?;
+        if let Some(cgroup) = &mut self.cgroup {
+            let new_threshold = self
+                .config
+                .cgroup_threshold(usable_system_memory, file_cache_disk_size);

            let message = format!(
-                "set cgroup memory.high to {} MiB, of new max {} MiB",
-                bytes_to_mebibytes(new_cgroup_mem_high),
-                bytes_to_mebibytes(available_memory)
+                "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
+                bytes_to_mebibytes(cgroup.threshold),
+                bytes_to_mebibytes(new_threshold),
+                bytes_to_mebibytes(usable_system_memory)
            );
+            cgroup.threshold = new_threshold;
            info!("downscale: {message}");
            status.push(message);
        }
@@ -305,8 +344,7 @@ impl Runner {
        let new_mem = resources.mem;
        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);

-        // Get the file cache's expected contribution to the memory usage
-        let mut file_cache_mem_usage = 0;
+        let mut file_cache_disk_size = 0;
        if let Some(file_cache) = &mut self.filecache {
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
@@ -319,8 +357,8 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
            }

            if actual_usage != expected_usage {
@@ -332,18 +370,18 @@ impl Runner {
            }
        }

-        if let Some(cgroup) = &self.cgroup {
-            let available_memory = usable_system_memory - file_cache_mem_usage;
-            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+        if let Some(cgroup) = &mut self.cgroup {
+            let new_threshold = self
+                .config
+                .cgroup_threshold(usable_system_memory, file_cache_disk_size);
+
            info!(
-                target = bytes_to_mebibytes(new_cgroup_mem_high),
-                total = bytes_to_mebibytes(new_mem),
-                name = cgroup.path(),
-                "updating cgroup memory.high",
+                "set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
+                bytes_to_mebibytes(cgroup.threshold),
+                bytes_to_mebibytes(new_threshold),
+                bytes_to_mebibytes(usable_system_memory)
            );
-            cgroup
-                .set_memory_high_bytes(new_cgroup_mem_high)
-                .context("failed to set cgroup memory.high")?;
+            cgroup.threshold = new_threshold;
        }

        Ok(())
@@ -361,10 +399,6 @@ impl Runner {
                self.handle_upscale(granted)
                    .await
                    .context("failed to handle upscale")?;
-                self.dispatcher
-                    .notify_upscale(Sequenced::new(granted))
-                    .await
-                    .context("failed to notify notify cgroup of upscale")?;
                Ok(Some(OutboundMsg::new(
                    OutboundMsgKind::UpscaleConfirmation {},
                    id,
@@ -408,33 +442,53 @@ impl Runner {
                        Err(e) => bail!("failed to receive kill signal: {e}")
                    }
                }
-                // we need to propagate an upscale request
-                request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
-                    if request.is_none() {
-                        bail!("failed to listen for upscale event from cgroup")
+
+                // New memory stats from the cgroup: we *may* need to request upscaling, if we've
+                // exceeded the threshold.
+                result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => {
+                    result.context("failed to receive from cgroup memory stats watcher")?;
+
+                    let cgroup = self.cgroup.as_ref().unwrap();
+
+                    let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow();
+
+                    // If we haven't exceeded the threshold, then we're all ok
+                    if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold {
+                        continue;
                    }

-                    // If it's been less than 1 second since the last time we requested upscaling,
-                    // ignore the event, to avoid spamming the agent (otherwise, this can happen
-                    // ~1k times per second).
+                    // Otherwise, we generally want upscaling. But, if it's been less than 1 second
+                    // since the last time we requested upscaling, ignore the event, to avoid
+                    // spamming the agent.
                    if let Some(t) = self.last_upscale_request_at {
                        let elapsed = t.elapsed();
                        if elapsed < Duration::from_secs(1) {
-                            info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
+                            info!(
+                                elapsed_millis = elapsed.as_millis(),
+                                avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
+                                threshold = bytes_to_mebibytes(cgroup.threshold),
+                                "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
+                            );
                            continue;
                        }
                    }

                    self.last_upscale_request_at = Some(Instant::now());

-                    info!("cgroup asking for upscale; forwarding request");
+                    info!(
+                        avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
+                        threshold = bytes_to_mebibytes(cgroup.threshold),
+                        "cgroup memory stats are high enough to upscale, requesting upscale",
+                    );
+
                    self.counter += 2; // Increment, preserving parity (i.e. keep the
                                       // counter odd). See the field comment for more.
                    self.dispatcher
                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
                        .await
                        .context("failed to send message")?;
-                }
+                },
+
                // there is a message from the agent
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
@@ -462,11 +516,14 @@ impl Runner {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
                                    Err(e) => {
-                                        let error = e.to_string();
-                                        warn!(?error, "error handling message");
+                                        // use {:#} for our logging because the display impl only
+                                        // gives the outermost cause, and the debug impl
+                                        // pretty-prints the error, whereas {:#} contains all the
+                                        // causes, but is compact (no newlines).
+                                        warn!(error = format!("{e:#}"), "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
-                                                error
+                                                error: e.to_string(),
                                            },
                                            message.id
                                        )
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "walproposer"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+utils.workspace = true
+postgres_ffi.workspace = true
+
+workspace_hack.workspace = true
+
+[build-dependencies]
+anyhow.workspace = true
+bindgen.workspace = true
--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -0,0 +1 @@
+#include "walproposer.h"
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -0,0 +1,113 @@
+use std::{env, path::PathBuf, process::Command};
+
+use anyhow::{anyhow, Context};
+use bindgen::CargoCallbacks;
+
+fn main() -> anyhow::Result<()> {
+    // Tell cargo to invalidate the built crate whenever the wrapper changes
+    println!("cargo:rerun-if-changed=bindgen_deps.h");
+
+    // Finding the location of built libraries and Postgres C headers:
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
+    // - if `<pg_install>/v16/bin/pg_config` exists, use it to get the server include directory, otherwise use `<pg_install>/v16/include/postgresql/server`
+    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
+        postgres_install_dir.into()
+    } else {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
+    };
+
+    let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
+    let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
+    let walproposer_lib_search_str = walproposer_lib_dir
+        .to_str()
+        .ok_or(anyhow!("Bad non-UTF path"))?;
+
+    let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
+    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
+    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
+
+    println!("cargo:rustc-link-lib=static=pgport");
+    println!("cargo:rustc-link-lib=static=pgcommon");
+    println!("cargo:rustc-link-lib=static=walproposer");
+    println!("cargo:rustc-link-search={walproposer_lib_search_str}");
+
+    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
+    let inc_server_path: String = if pg_config_bin.exists() {
+        let output = Command::new(pg_config_bin)
+            .arg("--includedir-server")
+            .output()
+            .context("failed to execute `pg_config --includedir-server`")?;
+
+        if !output.status.success() {
+            panic!("`pg_config --includedir-server` failed")
+        }
+
+        String::from_utf8(output.stdout)
+            .context("pg_config output is not UTF-8")?
+            .trim_end()
+            .into()
+    } else {
+        let server_path = pg_install_abs
+            .join("v16")
+            .join("include")
+            .join("postgresql")
+            .join("server")
+            .into_os_string();
+        server_path
+            .into_string()
+            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
+    };
+
+    // The bindgen::Builder is the main entry point
+    // to bindgen, and lets you build up options for
+    // the resulting bindings.
+    let bindings = bindgen::Builder::default()
+        // The input header we would like to generate
+        // bindings for.
+        .header("bindgen_deps.h")
+        // Tell cargo to invalidate the built crate whenever any of the
+        // included header files changed.
+        .parse_callbacks(Box::new(CargoCallbacks))
+        .allowlist_type("WalProposer")
+        .allowlist_type("WalProposerConfig")
+        .allowlist_type("walproposer_api")
+        .allowlist_function("WalProposerCreate")
+        .allowlist_function("WalProposerStart")
+        .allowlist_function("WalProposerBroadcast")
+        .allowlist_function("WalProposerPoll")
+        .allowlist_function("WalProposerFree")
+        .allowlist_var("DEBUG5")
+        .allowlist_var("DEBUG4")
+        .allowlist_var("DEBUG3")
+        .allowlist_var("DEBUG2")
+        .allowlist_var("DEBUG1")
+        .allowlist_var("LOG")
+        .allowlist_var("INFO")
+        .allowlist_var("NOTICE")
+        .allowlist_var("WARNING")
+        .allowlist_var("ERROR")
+        .allowlist_var("FATAL")
+        .allowlist_var("PANIC")
+        .allowlist_var("WPEVENT")
+        .allowlist_var("WL_LATCH_SET")
+        .allowlist_var("WL_SOCKET_READABLE")
+        .allowlist_var("WL_SOCKET_WRITEABLE")
+        .allowlist_var("WL_TIMEOUT")
+        .allowlist_var("WL_SOCKET_CLOSED")
+        .allowlist_var("WL_SOCKET_MASK")
+        .clang_arg("-DWALPROPOSER_LIB")
+        .clang_arg(format!("-I{pgxn_neon}"))
+        .clang_arg(format!("-I{inc_server_path}"))
+        // Finish the builder and generate the bindings.
+        .generate()
+        // Unwrap the Result and panic on failure.
+        .expect("Unable to generate bindings");
+
+    // Write the bindings to the $OUT_DIR/bindings.rs file.
+    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
+    bindings
+        .write_to_file(out_path)
+        .expect("Couldn't write bindings!");
+
+    Ok(())
+}
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -0,0 +1,455 @@
+#![allow(dead_code)]
+
+use std::ffi::CStr;
+use std::ffi::CString;
+
+use crate::bindings::uint32;
+use crate::bindings::walproposer_api;
+use crate::bindings::PGAsyncReadResult;
+use crate::bindings::PGAsyncWriteResult;
+use crate::bindings::Safekeeper;
+use crate::bindings::Size;
+use crate::bindings::StringInfoData;
+use crate::bindings::TimeLineID;
+use crate::bindings::TimestampTz;
+use crate::bindings::WalProposer;
+use crate::bindings::WalProposerConnStatusType;
+use crate::bindings::WalProposerConnectPollStatusType;
+use crate::bindings::WalProposerExecStatusType;
+use crate::bindings::WalproposerShmemState;
+use crate::bindings::XLogRecPtr;
+use crate::walproposer::ApiImpl;
+use crate::walproposer::WaitResult;
+
+/// C callback: fetches the shared-memory state from the Rust [`ApiImpl`]
+/// stored behind `wp->config->callback_data`.
+extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
+    unsafe {
+        // Reinterpret the opaque callback pointer as the boxed trait object.
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        // The `&mut` returned by the trait method coerces to the raw pointer
+        // in the return type.
+        (*api).get_shmem_state()
+    }
+}
+
+/// C callback: hands the streaming start position to
+/// [`ApiImpl::start_streaming`].
+extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).start_streaming(startpos);
+    }
+}
+
+/// C callback: returns whatever [`ApiImpl::get_flush_rec_ptr`] reports.
+extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).get_flush_rec_ptr()
+    }
+}
+
+/// C callback: returns whatever [`ApiImpl::get_current_timestamp`] reports.
+extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).get_current_timestamp()
+    }
+}
+
+extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let msg = (*api).conn_error_message(&mut (*sk));
+        let msg = CString::new(msg).unwrap();
+        // TODO: fix leaking error message
+        msg.into_raw()
+    }
+}
+
+/// C callback: reports the connection status of `sk` via
+/// [`ApiImpl::conn_status`].
+extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
+    unsafe {
+        // The implementation is reached through `sk->wp->config->callback_data`.
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).conn_status(&mut *sk)
+    }
+}
+
+/// C callback: forwards to [`ApiImpl::conn_connect_start`] for `sk`.
+extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).conn_connect_start(&mut *sk);
+    }
+}
+
+/// C callback: returns the result of [`ApiImpl::conn_connect_poll`] for `sk`.
+extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).conn_connect_poll(&mut *sk)
+    }
+}
+
+/// C callback: passes the NUL-terminated `query` to
+/// [`ApiImpl::conn_send_query`] as a `&str`.
+///
+/// Panics if `query` is not valid UTF-8.
+extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
+    let query_str = unsafe { CStr::from_ptr(query) }.to_str().unwrap();
+
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).conn_send_query(&mut *sk, query_str)
+    }
+}
+
+/// C callback: returns the result of [`ApiImpl::conn_get_query_result`] for `sk`.
+extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).conn_get_query_result(&mut *sk)
+    }
+}
+
+/// C callback: returns the integer produced by [`ApiImpl::conn_flush`] for `sk`.
+extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).conn_flush(&mut *sk)
+    }
+}
+
+/// C callback: forwards to [`ApiImpl::conn_finish`] for `sk`.
+extern "C" fn conn_finish(sk: *mut Safekeeper) {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).conn_finish(&mut *sk);
+    }
+}
+
+/// C callback: reads a message for `sk` through [`ApiImpl::conn_async_read`]
+/// and exposes it to C via `*buf` / `*amount`.
+///
+/// The bytes are copied into a `Vec<u8>` that is parked in `sk->inbuf`, so
+/// the pointer written to `*buf` stays valid until the next call replaces
+/// the buffer contents.
+extern "C" fn conn_async_read(
+    sk: *mut Safekeeper,
+    buf: *mut *mut ::std::os::raw::c_char,
+    amount: *mut ::std::os::raw::c_int,
+) -> PGAsyncReadResult {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        let (data, status) = (*api).conn_async_read(&mut *sk);
+
+        // Reuse the Vec left in `sk->inbuf` by the previous call, or start
+        // with an empty one on the first call.
+        let mut scratch = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
+        scratch.clear();
+        scratch.extend_from_slice(data);
+
+        // Park the Vec in `sk->inbuf` again and publish its data pointer.
+        *buf = store_vec_u8(&mut (*sk).inbuf, scratch);
+        *amount = data.len() as i32;
+
+        status
+    }
+}
+
+/// C callback: passes `size` bytes starting at `buf` to
+/// [`ApiImpl::conn_async_write`] for `sk`.
+extern "C" fn conn_async_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> PGAsyncWriteResult {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        // View the C buffer as a byte slice of the given length.
+        let payload = std::slice::from_raw_parts(buf.cast::<u8>(), size);
+        (*api).conn_async_write(&mut *sk, payload)
+    }
+}
+
+/// C callback: passes `size` bytes starting at `buf` to
+/// [`ApiImpl::conn_blocking_write`] for `sk`.
+extern "C" fn conn_blocking_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> bool {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        // View the C buffer as a byte slice of the given length.
+        let payload = std::slice::from_raw_parts(buf.cast::<u8>(), size);
+        (*api).conn_blocking_write(&mut *sk, payload)
+    }
+}
+
+/// C callback: forwards the `[startpos, endpos]` pair to
+/// [`ApiImpl::recovery_download`] for `sk`.
+///
+/// The timeline argument supplied by C is not passed on to the Rust side.
+extern "C" fn recovery_download(
+    sk: *mut Safekeeper,
+    _timeline: TimeLineID,
+    startpos: XLogRecPtr,
+    endpos: XLogRecPtr,
+) -> bool {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).recovery_download(&mut *sk, startpos, endpos)
+    }
+}
+
+/// C callback: lets [`ApiImpl::wal_read`] fill the `count`-byte buffer `buf`
+/// for `sk`, passing `startptr` along.
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size,
+) {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        // View the C buffer as `count` writable bytes.
+        let dst = std::slice::from_raw_parts_mut(buf.cast::<u8>(), count);
+        (*api).wal_read(&mut *sk, dst, startptr);
+    }
+}
+
+/// C callback: forwards to [`ApiImpl::wal_reader_allocate`] for `sk`.
+extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).wal_reader_allocate(&mut *sk);
+    }
+}
+
+/// C callback: forwards to [`ApiImpl::free_event_set`] for `wp`.
+extern "C" fn free_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).free_event_set(&mut *wp);
+    }
+}
+
+/// C callback: forwards to [`ApiImpl::init_event_set`] for `wp`.
+extern "C" fn init_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).init_event_set(&mut *wp);
+    }
+}
+
+/// C callback: forwards the `events` mask for `sk` to
+/// [`ApiImpl::update_event_set`].
+extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).update_event_set(&mut *sk, events);
+    }
+}
+
+/// C callback: forwards the `events` mask for `sk` to
+/// [`ApiImpl::add_safekeeper_event_set`].
+extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let api = (*(*(*sk).wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).add_safekeeper_event_set(&mut *sk, events);
+    }
+}
+
+/// C callback: waits via [`ApiImpl::wait_event_set`] and translates the
+/// [`WaitResult`] into the C out-parameters and return code.
+///
+/// * `Latch`   -> `*event_sk = NULL`, `*events = WL_LATCH_SET`, returns 1
+/// * `Timeout` -> `*event_sk = NULL`, `*events = WL_TIMEOUT`, returns 0
+/// * `Network(sk, mask)` -> `*event_sk = sk`, `*events = mask`, returns 1
+extern "C" fn wait_event_set(
+    wp: *mut WalProposer,
+    timeout: ::std::os::raw::c_long,
+    event_sk: *mut *mut Safekeeper,
+    events: *mut uint32,
+) -> ::std::os::raw::c_int {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+
+        let (ready_sk, ready_mask, rc) = match (*api).wait_event_set(&mut *wp, timeout) {
+            WaitResult::Latch => (std::ptr::null_mut(), crate::bindings::WL_LATCH_SET, 1),
+            WaitResult::Timeout => (std::ptr::null_mut(), crate::bindings::WL_TIMEOUT, 0),
+            WaitResult::Network(sk, event_mask) => (sk, event_mask, 1),
+        };
+
+        *event_sk = ready_sk;
+        *events = ready_mask;
+        rc
+    }
+}
+
+/// C callback: lets [`ApiImpl::strong_random`] fill the `len`-byte buffer
+/// `buf` and returns its success flag.
+extern "C" fn strong_random(
+    wp: *mut WalProposer,
+    buf: *mut ::std::os::raw::c_void,
+    len: usize,
+) -> bool {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        // View the C buffer as `len` writable bytes.
+        let dst = std::slice::from_raw_parts_mut(buf.cast::<u8>(), len);
+        (*api).strong_random(dst)
+    }
+}
+
+/// C callback: returns whatever [`ApiImpl::get_redo_start_lsn`] reports.
+extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).get_redo_start_lsn()
+    }
+}
+
+/// C callback: forwards `lsn` to [`ApiImpl::finish_sync_safekeepers`].
+extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).finish_sync_safekeepers(lsn);
+    }
+}
+
+/// C callback: forwards `commit_lsn` to
+/// [`ApiImpl::process_safekeeper_feedback`].
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).process_safekeeper_feedback(&mut *wp, commit_lsn);
+    }
+}
+
+/// C callback: forwards `lsn` to [`ApiImpl::confirm_wal_streamed`].
+extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).confirm_wal_streamed(&mut *wp, lsn);
+    }
+}
+
+/// C callback: forwards one log line to [`ApiImpl::log_internal`].
+///
+/// `level` is converted with [`Level::from`], which panics on an unknown
+/// value. `line` must be a NUL-terminated C string; bytes that are not valid
+/// UTF-8 are replaced with U+FFFD, so an oddly encoded message cannot panic
+/// inside this FFI callback.
+extern "C" fn log_internal(
+    wp: *mut WalProposer,
+    level: ::std::os::raw::c_int,
+    line: *const ::std::os::raw::c_char,
+) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        // Lossy conversion: borrows when the line is valid UTF-8, allocates
+        // only when replacement characters are needed.
+        let line = CStr::from_ptr(line).to_string_lossy();
+        (*api).log_internal(&mut (*wp), Level::from(level as u32), &line)
+    }
+}
+
+/// C callback: forwards to [`ApiImpl::after_election`] for `wp`.
+extern "C" fn after_election(wp: *mut WalProposer) {
+    unsafe {
+        let api = (*(*wp).config).callback_data.cast::<Box<dyn ApiImpl>>();
+        (*api).after_election(&mut *wp);
+    }
+}
+
+/// Log level passed to [`ApiImpl::log_internal`].
+///
+/// Each variant corresponds to the identically named constant from the
+/// generated bindings (`DEBUG5` ... `PANIC`, `WPEVENT`); see `Level::from`
+/// for the mapping.
+#[derive(Debug)]
+pub enum Level {
+    Debug5,
+    Debug4,
+    Debug3,
+    Debug2,
+    Debug1,
+    Log,
+    Info,
+    Notice,
+    Warning,
+    Error,
+    Fatal,
+    Panic,
+    WPEvent,
+}
+
+impl Level {
+    /// Maps a raw log level value coming from C onto [`Level`].
+    ///
+    /// Panics when `elevel` equals none of the level constants exported by
+    /// the bindings.
+    pub fn from(elevel: u32) -> Level {
+        // Path-qualified constants can only be read as constant patterns,
+        // never as fresh catch-all bindings.
+        use crate::bindings as c;
+
+        match elevel {
+            c::DEBUG5 => Level::Debug5,
+            c::DEBUG4 => Level::Debug4,
+            c::DEBUG3 => Level::Debug3,
+            c::DEBUG2 => Level::Debug2,
+            c::DEBUG1 => Level::Debug1,
+            c::LOG => Level::Log,
+            c::INFO => Level::Info,
+            c::NOTICE => Level::Notice,
+            c::WARNING => Level::Warning,
+            c::ERROR => Level::Error,
+            c::FATAL => Level::Fatal,
+            c::PANIC => Level::Panic,
+            c::WPEVENT => Level::WPEvent,
+            _ => panic!("unknown log level {}", elevel),
+        }
+    }
+}
+
+/// Builds the C `walproposer_api` callback table, with every slot pointing
+/// at the `extern "C"` trampoline of the same name defined in this module.
+///
+/// Each trampoline looks up the Rust [`ApiImpl`] through
+/// `config->callback_data` and forwards the call to it.
+pub(crate) fn create_api() -> walproposer_api {
+    walproposer_api {
+        get_shmem_state: Some(get_shmem_state),
+        start_streaming: Some(start_streaming),
+        get_flush_rec_ptr: Some(get_flush_rec_ptr),
+        get_current_timestamp: Some(get_current_timestamp),
+        conn_error_message: Some(conn_error_message),
+        conn_status: Some(conn_status),
+        conn_connect_start: Some(conn_connect_start),
+        conn_connect_poll: Some(conn_connect_poll),
+        conn_send_query: Some(conn_send_query),
+        conn_get_query_result: Some(conn_get_query_result),
+        conn_flush: Some(conn_flush),
+        conn_finish: Some(conn_finish),
+        conn_async_read: Some(conn_async_read),
+        conn_async_write: Some(conn_async_write),
+        conn_blocking_write: Some(conn_blocking_write),
+        recovery_download: Some(recovery_download),
+        wal_read: Some(wal_read),
+        wal_reader_allocate: Some(wal_reader_allocate),
+        free_event_set: Some(free_event_set),
+        init_event_set: Some(init_event_set),
+        update_event_set: Some(update_event_set),
+        add_safekeeper_event_set: Some(add_safekeeper_event_set),
+        wait_event_set: Some(wait_event_set),
+        strong_random: Some(strong_random),
+        get_redo_start_lsn: Some(get_redo_start_lsn),
+        finish_sync_safekeepers: Some(finish_sync_safekeepers),
+        process_safekeeper_feedback: Some(process_safekeeper_feedback),
+        confirm_wal_streamed: Some(confirm_wal_streamed),
+        log_internal: Some(log_internal),
+        after_election: Some(after_election),
+    }
+}
+
+/// `Display` prints the same text as the derived `Debug` (the variant name).
+impl std::fmt::Display for Level {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.write_fmt(format_args!("{:?}", self))
+    }
+}
+
+/// Take ownership of `Vec<u8>` from StringInfoData.
+///
+/// Returns `None` when `pg.data` is null. Otherwise the three fields of `pg`
+/// are reset to null/0 and the Vec is rebuilt from the previous
+/// `data`/`len`/`maxlen` values, which must describe a buffer previously
+/// stored by `store_vec_u8`.
+pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
+    let raw = pg.data as *mut u8;
+    if raw.is_null() {
+        return None;
+    }
+
+    let (len, cap) = (pg.len as usize, pg.maxlen as usize);
+
+    // Leave `pg` empty so the buffer has exactly one owner.
+    pg.data = std::ptr::null_mut();
+    pg.len = 0;
+    pg.maxlen = 0;
+
+    Some(unsafe { Vec::from_raw_parts(raw, len, cap) })
+}
+
+/// Store `Vec<u8>` in StringInfoData.
+///
+/// Ownership of the allocation moves into `pg` (`data`/`len`/`maxlen`); it
+/// must later be reclaimed with `take_vec_u8`. Returns the data pointer that
+/// was written to `pg.data`.
+///
+/// Panics if `pg` already holds a buffer, or if the Vec's capacity does not
+/// fit into the `i32` size fields of `pg`.
+fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
+    assert!(pg.data.is_null());
+    // `len` and `maxlen` are stored as i32. A silently truncated value would
+    // make `take_vec_u8` rebuild the Vec with the wrong bounds, so refuse
+    // oversized buffers up front (len <= capacity, so one check covers both).
+    assert!(
+        vec.capacity() <= i32::MAX as usize,
+        "buffer capacity {} does not fit into StringInfoData",
+        vec.capacity()
+    );
+
+    // Suppress the destructor: from here on the allocation belongs to `pg`.
+    let mut vec = std::mem::ManuallyDrop::new(vec);
+    // The buffer is handed out as a mutable pointer, so derive it from a
+    // mutable borrow.
+    let ptr = vec.as_mut_ptr() as *mut ::std::os::raw::c_char;
+
+    pg.data = ptr;
+    pg.len = vec.len() as i32;
+    pg.maxlen = vec.capacity() as i32;
+
+    ptr
+}
--- a/libs/walproposer/src/lib.rs
+++ b/libs/walproposer/src/lib.rs
@@ -0,0 +1,14 @@
+/// Raw FFI declarations generated by bindgen at build time.
+pub mod bindings {
+    // The generated code keeps the C naming conventions, so silence the
+    // corresponding naming lints for this module only.
+    #![allow(non_upper_case_globals)]
+    #![allow(non_camel_case_types)]
+    #![allow(non_snake_case)]
+    // bindgen creates some unsafe code with no doc comments.
+    #![allow(clippy::missing_safety_doc)]
+    // noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
+    #![allow(clippy::useless_transmute)]
+
+    // Pull in the `bindings.rs` file that the build script writes to $OUT_DIR.
+    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
+}
+
+pub mod api_bindings;
+pub mod walproposer;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -0,0 +1,485 @@
+use std::ffi::CString;
+
+use postgres_ffi::WAL_SEGMENT_SIZE;
+use utils::id::TenantTimelineId;
+
+use crate::{
+    api_bindings::{create_api, take_vec_u8, Level},
+    bindings::{
+        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
+        WalProposerStart,
+    },
+};
+
+/// Rust high-level wrapper for C walproposer API. Many methods are not required
+/// for simple cases, hence todo!() in default implementations.
+///
+/// Each method is invoked by the `extern "C"` callback of the same name in
+/// `api_bindings.rs`, which recovers the implementation from
+/// `config->callback_data`. A default implementation that is actually called
+/// panics via `todo!()`.
+///
+/// Refer to `pgxn/neon/walproposer.h` for documentation.
+pub trait ApiImpl {
+    /// The returned reference is handed to C as a raw pointer.
+    fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
+        todo!()
+    }
+
+    fn start_streaming(&self, _startpos: u64) {
+        todo!()
+    }
+
+    fn get_flush_rec_ptr(&self) -> u64 {
+        todo!()
+    }
+
+    fn get_current_timestamp(&self) -> i64 {
+        todo!()
+    }
+
+    /// The returned message is converted to a C string by the callback.
+    fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
+        todo!()
+    }
+
+    fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
+        todo!()
+    }
+
+    fn conn_connect_start(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn conn_connect_poll(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerConnectPollStatusType {
+        todo!()
+    }
+
+    fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
+        todo!()
+    }
+
+    fn conn_get_query_result(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerExecStatusType {
+        todo!()
+    }
+
+    fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
+        todo!()
+    }
+
+    fn conn_finish(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    /// The callback copies the returned bytes into a buffer stored in
+    /// `sk.inbuf` before handing them to C.
+    fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+        todo!()
+    }
+
+    fn conn_async_write(
+        &self,
+        _sk: &mut Safekeeper,
+        _buf: &[u8],
+    ) -> crate::bindings::PGAsyncWriteResult {
+        todo!()
+    }
+
+    fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
+        todo!()
+    }
+
+    /// The C callback also receives a timeline id, which is not passed on here.
+    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
+        todo!()
+    }
+
+    /// `_buf` is the caller-provided output buffer, viewed as a byte slice.
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
+        todo!()
+    }
+
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn free_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn init_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    /// The callback translates the returned [`WaitResult`] into the C
+    /// out-parameters and return code.
+    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
+        todo!()
+    }
+
+    /// `_buf` is the caller-provided output buffer, viewed as a byte slice.
+    fn strong_random(&self, _buf: &mut [u8]) -> bool {
+        todo!()
+    }
+
+    fn get_redo_start_lsn(&self) -> u64 {
+        todo!()
+    }
+
+    fn finish_sync_safekeepers(&self, _lsn: u64) {
+        todo!()
+    }
+
+    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
+        todo!()
+    }
+
+    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
+        todo!()
+    }
+
+    /// `_level` has already been converted from the raw C value by `Level::from`.
+    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
+        todo!()
+    }
+
+    fn after_election(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+}
+
+/// Outcome of [`ApiImpl::wait_event_set`], translated by the C callback into
+/// its out-parameters and return code.
+pub enum WaitResult {
+    /// Reported to C as `WL_LATCH_SET` with no safekeeper; return code 1.
+    Latch,
+    /// Reported to C as `WL_TIMEOUT` with no safekeeper; return code 0.
+    Timeout,
+    /// The safekeeper and event mask to report to C; return code 1.
+    Network(*mut Safekeeper, u32),
+}
+
+/// Rust-side configuration consumed by `Wrapper::new`, which converts it into
+/// the C `WalProposerConfig`.
+pub struct Config {
+    /// Tenant and timeline id
+    pub ttid: TenantTimelineId,
+    /// List of safekeepers in format `host:port`
+    /// (joined with `,` before being passed to C)
+    pub safekeepers_list: Vec<String>,
+    /// Safekeeper reconnect timeout in milliseconds
+    pub safekeeper_reconnect_timeout: i32,
+    /// Safekeeper connection timeout in milliseconds
+    pub safekeeper_connection_timeout: i32,
+    /// walproposer mode, finish when all safekeepers are synced or subscribe
+    /// to WAL streaming
+    pub sync_safekeepers: bool,
+}
+
+/// WalProposer main struct. C methods are reexported as Rust functions.
+pub struct Wrapper {
+    /// Pointer returned by `WalProposerCreate`; released in `Drop`.
+    wp: *mut WalProposer,
+    /// Owns the NUL-terminated safekeepers list whose pointer is stored in the
+    /// C config, keeping those bytes alive for as long as the wrapper exists.
+    _safekeepers_list_vec: Vec<u8>,
+}
+
+impl Wrapper {
+    /// Creates a C walproposer driven by the given Rust `api` implementation.
+    ///
+    /// Builds a heap-allocated `WalProposerConfig` from `config`, stores `api`
+    /// behind `callback_data`, and calls `WalProposerCreate` with the callback
+    /// table from `create_api()`. The tenant/timeline strings, the boxed `api`
+    /// and the config itself are handed to C as raw pointers and reclaimed in
+    /// `Drop`.
+    ///
+    /// Panics if the tenant id, timeline id, or safekeepers list contains a
+    /// NUL byte.
+    pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
+        let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
+            .unwrap()
+            .into_raw();
+        let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
+            .unwrap()
+            .into_raw();
+
+        // Kept as a Vec owned by the Wrapper, so the bytes stay alive for as
+        // long as the pointer stored in the config is in use.
+        let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
+            .unwrap()
+            .into_bytes_with_nul();
+        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
+        // Cast to `c_char` rather than `i8`: the signedness of C `char` is
+        // platform-dependent, and `i8` does not compile where it is unsigned.
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut ::std::os::raw::c_char;
+
+        // Double boxing gives a thin pointer that fits into `void *`.
+        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
+
+        let c_config = WalProposerConfig {
+            neon_tenant,
+            neon_timeline,
+            safekeepers_list,
+            safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
+            safekeeper_connection_timeout: config.safekeeper_connection_timeout,
+            wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
+            syncSafekeepers: config.sync_safekeepers,
+            systemId: 0,
+            pgTimeline: 1,
+            callback_data,
+        };
+        let c_config = Box::into_raw(Box::new(c_config));
+
+        let api = create_api();
+        let wp = unsafe { WalProposerCreate(c_config, api) };
+        Wrapper {
+            wp,
+            _safekeepers_list_vec: safekeepers_list_vec,
+        }
+    }
+
+    /// Runs `WalProposerStart` on the wrapped walproposer.
+    pub fn start(&self) {
+        unsafe { WalProposerStart(self.wp) }
+    }
+}
+
+/// Releases everything `Wrapper::new` handed to C as raw pointers, then frees
+/// the walproposer itself.
+impl Drop for Wrapper {
+    fn drop(&mut self) {
+        unsafe {
+            let config = (*self.wp).config;
+            // Reclaim the boxed `ApiImpl` stored behind `callback_data`.
+            drop(Box::from_raw(
+                (*config).callback_data as *mut Box<dyn ApiImpl>,
+            ));
+            // Reclaim the strings created with `CString::into_raw`.
+            drop(CString::from_raw((*config).neon_tenant));
+            drop(CString::from_raw((*config).neon_timeline));
+            // Reclaim the config allocated with `Box::into_raw`.
+            drop(Box::from_raw(config));
+
+            // Free any Rust-allocated read buffer still parked in a
+            // safekeeper's `inbuf`; the Vec is dropped immediately.
+            for i in 0..(*self.wp).n_safekeepers {
+                let sk = &mut (*self.wp).safekeeper[i as usize];
+                take_vec_u8(&mut sk.inbuf);
+            }
+
+            // NOTE(review): config and callback_data are already freed at this
+            // point; this assumes WalProposerFree neither reads `wp->config`
+            // nor invokes any callback. Confirm against the C implementation.
+            WalProposerFree(self.wp);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        cell::Cell,
+        sync::{atomic::AtomicUsize, mpsc::sync_channel},
+    };
+
+    use utils::id::TenantTimelineId;
+
+    use crate::{api_bindings::Level, walproposer::Wrapper};
+
+    use super::ApiImpl;
+
+    /// Safekeeper pointer and event mask that the mock's `wait_event_set`
+    /// reports back to walproposer.
+    #[derive(Clone, Copy, Debug)]
+    struct WaitEventsData {
+        sk: *mut crate::bindings::Safekeeper,
+        event_mask: u32,
+    }
+
+    /// Scripted `ApiImpl`: checks outgoing messages against
+    /// `expected_messages` and replays `safekeeper_replies` in order.
+    struct MockImpl {
+        // data to return from wait_event_set
+        wait_events: Cell<WaitEventsData>,
+        // walproposer->safekeeper messages
+        expected_messages: Vec<Vec<u8>>,
+        // index of the next entry in `expected_messages`
+        expected_ptr: AtomicUsize,
+        // safekeeper->walproposer messages
+        safekeeper_replies: Vec<Vec<u8>>,
+        // index of the next entry in `safekeeper_replies`
+        replies_ptr: AtomicUsize,
+        // channel to send LSN to the main thread
+        sync_channel: std::sync::mpsc::SyncSender<u64>,
+    }
+
+    impl MockImpl {
+        /// Asserts that `msg` equals the next scripted walproposer->safekeeper
+        /// message and advances the cursor. Panics when the script is exhausted.
+        fn check_walproposer_msg(&self, msg: &[u8]) {
+            use std::sync::atomic::Ordering;
+
+            let idx = self.expected_ptr.fetch_add(1, Ordering::SeqCst);
+            match self.expected_messages.get(idx) {
+                Some(expected_msg) => assert_eq!(msg, expected_msg.as_slice()),
+                None => panic!("unexpected message from walproposer"),
+            }
+        }
+
+        /// Returns the next scripted safekeeper->walproposer reply and advances
+        /// the cursor. Panics when the script is exhausted.
+        fn next_safekeeper_reply(&self) -> &[u8] {
+            use std::sync::atomic::Ordering;
+
+            let idx = self.replies_ptr.fetch_add(1, Ordering::SeqCst);
+            match self.safekeeper_replies.get(idx) {
+                Some(reply) => reply.as_slice(),
+                None => panic!("no more safekeeper replies"),
+            }
+        }
+    }
+
+    /// Mock API: every callback logs its name to stdout; connection calls
+    /// report success, and reads/writes follow the scripts held in `MockImpl`.
+    impl ApiImpl for MockImpl {
+        fn get_current_timestamp(&self) -> i64 {
+            println!("get_current_timestamp");
+            0
+        }
+
+        fn conn_status(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnStatusType {
+            println!("conn_status");
+            crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
+        }
+
+        fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("conn_connect_start");
+        }
+
+        fn conn_connect_poll(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnectPollStatusType {
+            println!("conn_connect_poll");
+            crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
+        }
+
+        fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
+            println!("conn_send_query: {}", query);
+            true
+        }
+
+        fn conn_get_query_result(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerExecStatusType {
+            println!("conn_get_query_result");
+            crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
+        }
+
+        // Replays the next scripted safekeeper reply; always reports success.
+        fn conn_async_read(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+            println!("conn_async_read");
+            let reply = self.next_safekeeper_reply();
+            println!("conn_async_read result: {:?}", reply);
+            (
+                reply,
+                crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
+            )
+        }
+
+        // Verifies the outgoing message against the next expected one.
+        fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
+            println!("conn_blocking_write: {:?}", buf);
+            self.check_walproposer_msg(buf);
+            true
+        }
+
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("wal_reader_allocate")
+        }
+
+        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("free_event_set")
+        }
+
+        fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("init_event_set")
+        }
+
+        // Remembers the most recent (safekeeper, mask) pair; `wait_event_set`
+        // reports it back as if the event had fired.
+        fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "update_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        // Same bookkeeping as `update_event_set`.
+        fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        // Never blocks: immediately reports the last registered pair as a
+        // network event.
+        fn wait_event_set(
+            &self,
+            _: &mut crate::bindings::WalProposer,
+            timeout_millis: i64,
+        ) -> super::WaitResult {
+            let data = self.wait_events.get();
+            println!(
+                "wait_event_set, timeout_millis={}, res={:?}",
+                timeout_millis, data
+            );
+            super::WaitResult::Network(data.sk, data.event_mask)
+        }
+
+        // Deterministic "randomness": the buffer is filled with zeros.
+        fn strong_random(&self, buf: &mut [u8]) -> bool {
+            println!("strong_random");
+            buf.fill(0);
+            true
+        }
+
+        // Publishes the resulting LSN to the test thread, then panics; the
+        // test uses the panic to get out of `Wrapper::start`.
+        fn finish_sync_safekeepers(&self, lsn: u64) {
+            self.sync_channel.send(lsn).unwrap();
+            panic!("sync safekeepers finished at lsn={}", lsn);
+        }
+
+        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
+            println!("walprop_log[{}] {}", level, msg);
+        }
+
+        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
+            println!("after_election");
+        }
+    }
+
+    /// Test that walproposer can successfully connect to safekeeper and finish
+    /// sync_safekeepers. API is mocked in MockImpl.
+    ///
+    /// Run this test with valgrind to detect leaks:
+    /// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
+    #[test]
+    fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
+        // The same id is used for both tenant and timeline.
+        let ttid = TenantTimelineId::new(
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+        );
+
+        // Capacity 1 lets the mock send the final LSN without blocking.
+        let (sender, receiver) = sync_channel(1);
+
+        let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
+            wait_events: Cell::new(WaitEventsData {
+                sk: std::ptr::null_mut(),
+                event_mask: 0,
+            }),
+            expected_messages: vec![
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
+                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
+                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
+                ],
+                // VoteRequest(VoteRequest { term: 3 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            expected_ptr: AtomicUsize::new(0),
+            safekeeper_replies: vec![
+                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                ],
+                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
+                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
+                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            replies_ptr: AtomicUsize::new(0),
+            sync_channel: sender,
+        });
+        let config = crate::walproposer::Config {
+            ttid,
+            safekeepers_list: vec!["localhost:5000".to_string()],
+            safekeeper_reconnect_timeout: 1000,
+            safekeeper_connection_timeout: 10000,
+            sync_safekeepers: true,
+        };
+
+        let wp = Wrapper::new(my_impl, config);
+
+        // walproposer will panic when it finishes sync_safekeepers
+        // NOTE(review): the panic is raised inside an `extern "C"` callback
+        // and has to unwind through C frames to reach `catch_unwind`; confirm
+        // this is supported by the toolchain and build flags in use.
+        std::panic::catch_unwind(|| wp.start()).unwrap_err();
+        // validate the resulting LSN
+        // (1337 == 0x539, the flush_lsn `0/539` in the scripted VoteResponse)
+        assert_eq!(receiver.recv()?, 1337);
+        Ok(())
+        // drop() will free up resources here
+    }
+}
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -11,10 +11,7 @@ use std::sync::{Arc, Barrier};

 use bytes::{Buf, Bytes};
 use pageserver::{
-    config::PageServerConf,
-    repository::Key,
-    walrecord::NeonWalRecord,
-    walredo::{PostgresRedoManager, WalRedoError},
+    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
 use utils::{id::TenantId, lsn::Lsn};

@@ -35,9 +32,15 @@ fn redo_scenarios(c: &mut Criterion) {

    let manager = Arc::new(manager);

-    tracing::info!("executing first");
-    short().execute(&manager).unwrap();
-    tracing::info!("first executed");
+    {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+        tracing::info!("executing first");
+        short().execute(rt.handle(), &manager).unwrap();
+        tracing::info!("first executed");
+    }

    let thread_counts = [1, 2, 4, 8, 16];

@@ -80,9 +83,14 @@ fn add_multithreaded_walredo_requesters(
    assert_ne!(threads, 0);

    if threads == 1 {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+        let handle = rt.handle();
        b.iter_batched_ref(
            || Some(input_factory()),
-            |input| execute_all(input.take(), manager),
+            |input| execute_all(input.take(), handle, manager),
            criterion::BatchSize::PerIteration,
        );
    } else {
@@ -98,19 +106,26 @@ fn add_multithreaded_walredo_requesters(
                    let manager = manager.clone();
                    let barrier = barrier.clone();
                    let work_rx = work_rx.clone();
-                    move || loop {
-                        // queue up and wait if we want to go another round
-                        if work_rx.lock().unwrap().recv().is_err() {
-                            break;
+                    move || {
+                        let rt = tokio::runtime::Builder::new_current_thread()
+                            .enable_all()
+                            .build()
+                            .unwrap();
+                        let handle = rt.handle();
+                        loop {
+                            // queue up and wait if we want to go another round
+                            if work_rx.lock().unwrap().recv().is_err() {
+                                break;
+                            }
+
+                            let input = Some(input_factory());
+
+                            barrier.wait();
+
+                            execute_all(input, handle, &manager).unwrap();
+
+                            barrier.wait();
                        }
-
-                        let input = Some(input_factory());
-
-                        barrier.wait();
-
-                        execute_all(input, &manager).unwrap();
-
-                        barrier.wait();
                    }
                })
            })
@@ -152,15 +167,19 @@ impl Drop for JoinOnDrop {
    }
 }

-fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError>
+fn execute_all<I>(
+    input: I,
+    handle: &tokio::runtime::Handle,
+    manager: &PostgresRedoManager,
+) -> anyhow::Result<()>
 where
    I: IntoIterator<Item = Request>,
 {
    // just fire all requests as fast as possible
    input.into_iter().try_for_each(|req| {
-        let page = req.execute(manager)?;
+        let page = req.execute(handle, manager)?;
        assert_eq!(page.remaining(), 8192);
-        Ok::<_, WalRedoError>(())
+        anyhow::Ok(())
    })
 }

@@ -473,9 +492,11 @@ struct Request {
 }

 impl Request {
-    fn execute(self, manager: &PostgresRedoManager) -> Result<Bytes, WalRedoError> {
-        use pageserver::walredo::WalRedoManager;
-
+    fn execute(
+        self,
+        rt: &tokio::runtime::Handle,
+        manager: &PostgresRedoManager,
+    ) -> anyhow::Result<Bytes> {
        let Request {
            key,
            lsn,
@@ -484,6 +505,6 @@ impl Request {
            pg_version,
        } = self;

-        manager.request_redo(key, lsn, base_img, records, pg_version)
+        rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
    }
 }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,6 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
+use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
 use tokio::io;
@@ -180,6 +181,7 @@ where
            }
        }

+        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
        for ((spcnode, dbnode), has_relmap_file) in
            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
@@ -213,6 +215,34 @@ where
                    self.add_rel(rel, rel).await?;
                }
            }
+        }
+        // Aux files are per-timeline, not per-database: add them once, outside the dbdir loop.
+        for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
+            if path.starts_with("pg_replslot") {
+                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
+                let restart_lsn = Lsn(u64::from_le_bytes(
+                    content[offs..offs + 8].try_into().unwrap(),
+                ));
+                info!("Replication slot {} restart LSN={}", path, restart_lsn);
+                min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
+            }
+            let header = new_tar_header(&path, content.len() as u64)?;
+            self.ar
+                .append(&header, &*content)
+                .await
+                .context("could not add aux file to basebackup tarball")?;
+        }
+        if min_restart_lsn != Lsn::MAX {
+            info!(
+                "Min restart LSN for logical replication is {}",
+                min_restart_lsn
+            );
+            let data = min_restart_lsn.0.to_le_bytes();
+            let header = new_tar_header("restart.lsn", data.len() as u64)?;
+            self.ar
+                .append(&header, &data[..])
+                .await
+                .context("could not add restart.lsn file to basebackup tarball")?;
        }
        for xid in self
            .timeline
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,6 +2,7 @@

 use std::env::{var, VarError};
 use std::sync::Arc;
+use std::time::Duration;
 use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
@@ -33,11 +34,12 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
-    tcp_listener,
+    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
+    signals::Signal, tcp_listener,
 };

 project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);

 const PID_FILE_NAME: &str = "pageserver.pid";

@@ -200,6 +202,51 @@ fn initialize_config(
    })
 }

+struct WaitForPhaseResult<F: std::future::Future + Unpin> {
+    timeout_remaining: Duration, // what is left of the timeout after this phase; zero if used up
+    skipped: Option<F>, // the unfinished future when the phase timed out, `None` when it completed
+}
+
+/// Waits for `fut` for at most `timeout`, so that one slow Tenant cannot stall
+/// the whole service during startup.  If the timeout elapses first, that is
+/// logged under `phase` and the unfinished future is handed back in `skipped`.
+/// `timeout_remaining` is `timeout` minus the time spent here, floored at zero.
+async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
+where
+    F: std::future::Future + Unpin,
+{
+    let initial_t = Instant::now();
+    let skipped = match tokio::time::timeout(timeout, &mut fut).await {
+        Ok(_) => None,
+        Err(_) => {
+            tracing::info!(
+                timeout_millis = timeout.as_millis(),
+                %phase,
+                "Startup phase timed out, proceeding anyway"
+            );
+            Some(fut) // only borrowed by the timeout above, so it can be awaited again later
+        }
+    };
+
+    WaitForPhaseResult {
+        timeout_remaining: timeout
+            .checked_sub(Instant::now().duration_since(initial_t))
+            .unwrap_or(Duration::ZERO), // overspent budget becomes zero rather than underflowing
+        skipped,
+    }
+}
+
+fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
+    let elapsed = started_at.elapsed();
+    let secs = elapsed.as_secs_f64();
+    STARTUP_DURATION.with_label_values(&[phase]).set(secs); // metric is labelled by `phase`
+    // The log line carries the human-readable phase name plus the same elapsed time.
+    info!(
+        elapsed_ms = elapsed.as_millis(),
+        "{human_phase} ({secs:.3}s since start)"
+    )
+}
+
 fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
@@ -207,26 +254,17 @@ fn start_pageserver(
    // Monotonic time for later calculating startup duration
    let started_startup_at = Instant::now();

-    let startup_checkpoint = move |phase: &str, human_phase: &str| {
-        let elapsed = started_startup_at.elapsed();
-        let secs = elapsed.as_secs_f64();
-        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "{human_phase} ({secs:.3}s since start)"
-        )
-    };
-
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
    // A changed launch timestamp indicates a pageserver restart.
    info!(
-        "version: {} launch_timestamp: {}",
+        "version: {} launch_timestamp: {} build_tag: {}",
        version(),
-        launch_ts.to_string()
+        launch_ts.to_string(),
+        BUILD_TAG,
    );
-    set_build_info_metric(GIT_VERSION);
+    set_build_info_metric(GIT_VERSION, BUILD_TAG);
    set_launch_timestamp_metric(launch_ts);
    pageserver::preinitialize_metrics();

@@ -341,7 +379,7 @@ fn start_pageserver(

    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
-    startup_checkpoint("initial", "Starting loading tenants");
+    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
@@ -355,6 +393,7 @@ fn start_pageserver(
    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
    // (background_task_maximum_delay).
+    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
@@ -362,7 +401,8 @@ fn start_pageserver(
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
-        initial_tenant_load: Some(init_done_tx),
+        initial_tenant_load_remote: Some(init_remote_done_tx),
+        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
@@ -386,55 +426,93 @@ fn start_pageserver(
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial load completed")
+            });

-            init_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load", "Initial load completed");
-            STARTUP_IS_LOADING.set(0);
+            let timeout = conf.background_task_maximum_delay;
+
+            let init_remote_done = std::pin::pin!(async {
+                init_remote_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load_remote",
+                    "Remote part of initial load completed",
+                );
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_remote_skipped,
+            } = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
+
+            let init_load_done = std::pin::pin!(async {
+                init_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load",
+                    "Initial load completed",
+                );
+                STARTUP_IS_LOADING.set(0);
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_load_skipped,
+            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial logical sizes completed")
+            });

-            let timeout = conf.background_task_maximum_delay;
+            let logical_sizes_done = std::pin::pin!(async {
+                init_logical_size_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_logical_sizes",
+                    "Initial logical sizes completed",
+                );
+            });

-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-
-            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
-                Ok(_) => {
-                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
-                    None
-                }
-                Err(_) => {
-                    tracing::info!(
-                        timeout_millis = timeout.as_millis(),
-                        "Initial logical size timeout elapsed; starting background jobs"
-                    );
-                    Some(init_sizes_done)
-                }
-            };
+            let WaitForPhaseResult {
+                timeout_remaining: _,
+                skipped: logical_sizes_skipped,
+            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;

            scopeguard::ScopeGuard::into_inner(guard);

-            // allow background jobs to start
+            // allow background jobs to start: we either completed prior stages, or they reached timeout
+            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
+            // because things like consumption metrics for billing are blocked by this barrier.
            drop(background_jobs_can_start);
-            startup_checkpoint("background_jobs_can_start", "Starting background jobs");
-
-            if let Some(init_sizes_done) = init_sizes_done {
-                // ending up here is not a bug; at the latest logical sizes will be queried by
-                // consumption metrics.
-                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-                init_sizes_done.await;
-
-                scopeguard::ScopeGuard::into_inner(guard);
-
-                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
+            startup_checkpoint(
+                started_startup_at,
+                "background_jobs_can_start",
+                "Starting background jobs",
+            );

+            // We are done. If we skipped any phases due to timeout, run them to completion here so that
+            // they will eventually update their startup_checkpoint, and so that we do not declare the
+            // 'complete' stage until all the other stages are really done.
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before waiting for skipped phases done")
+            });
+            if let Some(f) = init_remote_skipped {
+                f.await;
            }
+            if let Some(f) = init_load_skipped {
+                f.await;
+            }
+            if let Some(f) = logical_sizes_skipped {
+                f.await;
+            }
+            scopeguard::ScopeGuard::into_inner(guard);

-            startup_checkpoint("complete", "Startup complete");
+            startup_checkpoint(started_startup_at, "complete", "Startup complete");
        };

        async move {
@@ -574,6 +652,7 @@ fn start_pageserver(
                    pageserver_listener,
                    conf.pg_auth_type,
                    libpq_ctx,
+                    task_mgr::shutdown_token(),
                )
                .await
            },
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,8 +33,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
-    TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
-    TIMELINES_SEGMENT_NAME,
+    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
@@ -633,11 +632,6 @@ impl PageServerConf {
        self.tenants_path().join(tenant_id.to_string())
    }

-    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id)
-            .join(TENANT_ATTACHING_MARKER_FILENAME)
-    }
-
    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,6 +2,7 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
@@ -10,6 +11,7 @@ use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
+use tokio::time::Instant;
 use tracing::*;
 use utils::id::NodeId;

@@ -87,22 +89,12 @@ pub async fn collect_metrics(

    let node_id = node_id.to_string();

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
    loop {
-        let tick_at = tokio::select! {
-            _ = cancel.cancelled() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        // these are point in time, with variable "now"
        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        if metrics.is_empty() {
-            continue;
-        }
-
        let metrics = Arc::new(metrics);

        // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -141,10 +133,19 @@ pub async fn collect_metrics(
        let (_, _) = tokio::join!(flush, upload);

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            metric_collection_interval,
-            "consumption_metrics_collect_metrics",
+            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            cancel.cancelled(), // this function's own token, not the task-global one
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(()); // `Ok` here means cancellation fired before the deadline
+        }
    }
 }

@@ -243,16 +244,14 @@ async fn calculate_synthetic_size_worker(
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
+    scopeguard::defer! {
+        info!("calculate_synthetic_size_worker stopped");
+    };

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        let tick_at = tokio::select! {
-            _ = task_mgr::shutdown_watcher() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
@@ -268,6 +267,11 @@ async fn calculate_synthetic_size_worker(
            }

            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+                // We can put in some prioritization for consumption metrics.
+                // Same for the loop that fetches computed metrics.
+                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+                // which turns out is really handy to understand the system.
                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
@@ -275,9 +279,18 @@ async fn calculate_synthetic_size_worker(
        }

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            synthetic_size_calculation_interval,
-            "consumption_metrics_synthetic_size_worker",
+            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + synthetic_size_calculation_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -57,7 +57,10 @@ impl ControlPlaneClient {

        if let Some(jwt) = &conf.control_plane_api_token {
            let mut headers = hyper::HeaderMap::new();
-            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            headers.insert(
+                "Authorization",
+                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
+            );
            client = client.default_headers(headers);
        }

@@ -144,7 +147,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|t| (t.id, Generation::new(t.generation)))
+            .map(|t| (t.id, Generation::new(t.gen)))
            .collect::<HashMap<_, _>>())
    }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -271,7 +272,9 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
-            .map_err(Into::into)
+            .maybe_fatal_err("save deletion header")?;
+
+        Ok(())
    }
 }

@@ -360,6 +363,7 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
+            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
 }
@@ -1298,10 +1302,6 @@ pub(crate) mod mock {
            }
        }

-        pub fn get_executed(&self) -> usize {
-            self.executed.load(Ordering::Relaxed)
-        }
-
        #[allow(clippy::await_holding_lock)]
        pub async fn pump(&self) {
            if let Some(remote_storage) = &self.remote_storage {
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::virtual_file::on_fatal_io_error;
+use crate::virtual_file::MaybeFatalIo;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -195,7 +197,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    Err(anyhow::anyhow!(e))
+                    on_fatal_io_error(&e, "reading deletion header");
                }
            }
        }
@@ -216,16 +218,9 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
-            Ok(d) => d,
-            Err(e) => {
-                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
-
-                // Give up: if we can't read the deletion list directory, we probably can't
-                // write lists into it later, so the queue won't work.
-                return Err(e.into());
-            }
-        };
+        let mut dir = tokio::fs::read_dir(&deletion_directory)
+            .await
+            .fatal_err("read deletion directory");

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -233,7 +228,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await? {
+        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -246,11 +241,9 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
-                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
-                    // Non-fatal error: we will just leave the file behind but not
-                    // try and load it.
-                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
-                }
+                tokio::fs::remove_file(&absolute_path)
+                    .await
+                    .fatal_err("delete temp file");

                continue;
            }
@@ -290,7 +283,9 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path).await?;
+            let list_bytes = tokio::fs::read(&list_path)
+                .await
+                .fatal_err("read deletion list");

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
@@ -349,7 +344,7 @@ impl ListWriter {
        info!("Started deletion frontend worker");

        // Synchronous, but we only do it once per process lifetime so it's tolerable
-        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+        if let Err(e) = create_dir_all(self.conf.deletion_prefix()) {
            tracing::error!(
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
+use crate::virtual_file::MaybeFatalIo;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -287,16 +288,9 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
-
-            if let Err(e) = tokio::fs::remove_file(&list_path).await {
-                // Unexpected: we should have permissions and nothing else should
-                // be touching these files.  We will leave the file behind.  Subsequent
-                // pageservers will try and load it again: hopefully whatever storage
-                // issue (probably permissions) has been fixed by then.
-                tracing::error!("Failed to delete {list_path}: {e:#}");
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                break;
-            }
+            tokio::fs::remove_file(&list_path)
+                .await
+                .fatal_err("remove deletion list");
        }
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,11 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{
+        self,
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        Timeline,
+    },
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -108,7 +112,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
                .await;
            Ok(())
        },
@@ -121,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: GenericRemoteStorage,
+    _storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
@@ -145,14 +149,8 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                &storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;

            match res {
                Ok(()) => {}
@@ -183,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -273,7 +270,6 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -330,9 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
+    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -349,10 +346,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        batched
-            .entry(TimelineKey(candidate.timeline))
-            .or_default()
-            .push(candidate.layer);
+        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
+        // tasks to evict all seen layers until we have evicted enough
+
+        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
+
+        // semaphore will later be used to limit eviction concurrency, and we can express at
+        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
+        // but fail gracefully by not making batches larger.
+        if batch.len() < u32::MAX as usize {
+            batch.push(candidate.layer);
+            max_batch_size = max_batch_size.max(batch.len());
+        }
    }

    let usage_planned = match warned {
@@ -369,64 +374,101 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

-    // After the loop, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
+    let mut js = tokio::task::JoinSet::new();
+
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size = batch.len();
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+
+        // NOTE(review): `available_permits` is the number of permits currently free, not the
+        // total; it shrinks while spawned batches hold permits, so confirm this cannot misfire
+        assert!(batch_size as usize <= limit.available_permits());

        debug!(%timeline_id, "evicting batch for timeline");

-        async {
-            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();

-            match results {
-                Err(e) => {
-                    warn!("failed to evict batch: {:#}", e);
-                }
-                Ok(results) => {
-                    assert_eq!(results.len(), batch.len());
-                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
-                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
-                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            None => {
-                                assert!(cancel.is_cancelled());
-                                return;
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch, &cancel).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
                            }
                        }
                    }
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
+                    }
                }
+                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
-        .await;
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        if cancel.is_cancelled() {
+        js.spawn(evict);
+
+        // spawning multiple thousands of these is essentially blocking, so give the already
+        // spawned tasks a chance of making progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
+        (usage_assumed, evictions_failed)
+    };
+
+    let (usage_assumed, evictions_failed) = tokio::select! {
+        tuple = join_all => { tuple },
+        _ = cancel.cancelled() => {
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
-    }
+    };

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -441,7 +483,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 #[derive(Clone)]
 struct EvictionCandidate {
    timeline: Arc<Timeline>,
-    layer: Arc<dyn PersistentLayer>,
+    layer: Layer,
    last_activity_ts: SystemTime,
 }

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -306,6 +306,67 @@ paths:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"

+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: Get timestamp for a given LSN
+      parameters:
+        - name: lsn
+          in: query
+          required: true
+          schema:
+            type: integer
+          description: The LSN to look up the timestamp for
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: string
+                format: date-time
+        "400":
+          description: Error when no tenant id found in path, no timeline id or invalid LSN
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Timeline not found, or there is no timestamp information for the given lsn
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
    parameters:
@@ -331,13 +392,19 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
+        - name: version
+          in: query
+          required: false
+          schema:
+            type: integer
+          description: The version of the endpoint to use
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
-                type: string
+                $ref: "#/components/schemas/LsnByTimestampResponse"
        "400":
          description: Error when no tenant id found in path, no timeline id or invalid timestamp
          content:
@@ -502,7 +569,17 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
-          description: Tenant download is already in progress
+          description: |
+            The tenant is already known to Pageserver in some way,
+            and hence this `/attach` call has been rejected.
+
+            Some examples of how this can happen:
+            - tenant was created on this pageserver
+            - tenant attachment was started by an earlier call to `/attach`.
+
+            Callers should poll the tenant status's `attachment_status` field,
+            like for status 202. See the longer description for `POST /attach`
+            for details.
          content:
            application/json:
              schema:
@@ -646,6 +723,12 @@ paths:

        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
        Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantLoadRequest"
      responses:
        "202":
          description: Tenant scheduled to load successfully
@@ -1136,6 +1219,15 @@ components:
            new_tenant_id:
              type: string
              format: hex
+            generation:
+              type: integer
+              description: Attachment generation number.
+    TenantLoadRequest:
+      type: object
+      properties:
+        generation:
+          type: integer
+          description: Attachment generation number.
    TenantAttachRequest:
      type: object
      required:
@@ -1323,6 +1415,19 @@ components:
          type: string
          format: hex

+    LsnByTimestampResponse:
+      type: object
+      required:
+        - lsn
+        - kind
+      properties:
+        lsn:
+          type: string
+          format: hex
+        kind:
+          type: string
+          enum: [past, present, future, nodata]
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2,11 +2,13 @@
 //! Management HTTP API
 //!
 use std::collections::HashMap;
+use std::str::FromStr;
 use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
-use hyper::header::CONTENT_TYPE;
+use humantime::format_rfc3339;
+use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -15,6 +17,7 @@ use pageserver_api::models::{
    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
+use serde_with::{serde_as, DisplayFromStr};
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -136,9 +139,7 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::AncestorStopping(_) => {
                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
-            PageReconstructError::WalRedo(pre) => {
-                ApiError::InternalServerError(anyhow::Error::new(pre))
-            }
+            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
 }
@@ -484,6 +485,8 @@ async fn get_lsn_by_timestamp_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

+    let version: Option<u8> = parse_query_param(&request, "version")?;
+
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -495,13 +498,59 @@ async fn get_lsn_by_timestamp_handler(
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

-    let result = match result {
-        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-        LsnForTimestamp::Future(_lsn) => "future".into(),
-        LsnForTimestamp::Past(_lsn) => "past".into(),
-        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-    };
-    json_response(StatusCode::OK, result)
+    if version.unwrap_or(0) > 1 {
+        #[serde_as]
+        #[derive(serde::Serialize)]
+        struct Result {
+            #[serde_as(as = "DisplayFromStr")]
+            lsn: Lsn,
+            kind: &'static str,
+        }
+        let (lsn, kind) = match result {
+            LsnForTimestamp::Present(lsn) => (lsn, "present"),
+            LsnForTimestamp::Future(lsn) => (lsn, "future"),
+            LsnForTimestamp::Past(lsn) => (lsn, "past"),
+            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
+        };
+        json_response(StatusCode::OK, Result { lsn, kind })
+    } else {
+        // FIXME: this is a temporary crutch not to break backwards compatibility
+        // See https://github.com/neondatabase/neon/pull/5608
+        let result = match result {
+            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+            LsnForTimestamp::Future(_lsn) => "future".into(),
+            LsnForTimestamp::Past(_lsn) => "past".into(),
+            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+        };
+        json_response(StatusCode::OK, result)
+    }
+}
+
+async fn get_timestamp_of_lsn_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let lsn_str = must_get_query_param(&request, "lsn")?;
+    let lsn = Lsn::from_str(&lsn_str)
+        .with_context(|| format!("Invalid LSN: {lsn_str:?}"))
+        .map_err(ApiError::BadRequest)?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
+
+    match result {
+        Some(time) => {
+            let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
+            json_response(StatusCode::OK, time)
+        }
+        None => json_response(StatusCode::NOT_FOUND, ()),
+    }
 }

 async fn tenant_attach_handler(
@@ -740,6 +789,10 @@ async fn tenant_size_handler(
        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
+    let accepts_html = headers
+        .get(header::ACCEPT)
+        .map(|v| v == "text/html")
+        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
        let storage_model = inputs
            .calculate_model()
@@ -747,11 +800,11 @@ async fn tenant_size_handler(
        let size = storage_model.calculate();

        // If request header expects html, return html
-        if headers["Accept"] == "text/html" {
+        if accepts_html {
            return synthetic_size_html_response(inputs, storage_model, size);
        }
        sizes = Some(size);
-    } else if headers["Accept"] == "text/html" {
+    } else if accepts_html {
        return Err(ApiError::BadRequest(anyhow!(
            "inputs_only parameter is incompatible with html output request"
        )));
@@ -902,7 +955,7 @@ fn synthetic_size_html_response(
 pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
    let response = Response::builder()
        .status(status)
-        .header(hyper::header::CONTENT_TYPE, "text/html")
+        .header(header::CONTENT_TYPE, "text/html")
        .body(Body::from(data.as_bytes().to_vec()))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
@@ -1036,9 +1089,17 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
+        if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
            .instrument(info_span!("tenant_detach", %tenant_id))
-            .await?;
+            .await
+        {
+            match e {
+                TenantStateError::NotFound(_) => {
+                    // This API is idempotent: a NotFound on a detach is fine.
+                }
+                _ => return Err(e.into()),
+            }
+        }
        return json_response(StatusCode::OK, ());
    }

@@ -1144,7 +1205,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1169,7 +1230,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, ())
    }
@@ -1275,7 +1336,7 @@ async fn getpage_at_lsn_handler(
        Result::<_, ApiError>::Ok(
            Response::builder()
                .status(StatusCode::OK)
-                .header(CONTENT_TYPE, "application/octet-stream")
+                .header(header::CONTENT_TYPE, "application/octet-stream")
                .body(hyper::Body::from(page))
                .unwrap(),
        )
@@ -1439,11 +1500,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.clone() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

    let state = state.disk_usage_eviction_state.clone();

@@ -1461,7 +1522,6 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
-                &storage,
                usage,
                &child_cancel,
            )
@@ -1674,6 +1734,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
+            |r| api_handler(r, get_timestamp_of_lsn_handler),
+        )
        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
            api_handler(r, timeline_gc_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -149,6 +149,10 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    }
 }

+// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
+// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
+// from the name.
+
 pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
@@ -173,6 +177,9 @@ fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
 /// delaying is needed.
 #[derive(Clone)]
 pub struct InitializationOrder {
+    /// Each initial tenant load task carries this until it is done loading timelines from remote storage
+    pub initial_tenant_load_remote: Option<utils::completion::Completion>,
+
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1067,6 +1067,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_start_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls started",
+            &["task"],
+        )
+        .unwrap()
+    });
+
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_finish_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+            &["task"],
+        )
+        .unwrap()
+    });
+
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
@@ -1368,28 +1388,23 @@ impl TimelineMetrics {
        }
    }

-    pub fn record_new_file_metrics(&self, sz: u64) {
+    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }

-    pub fn resident_physical_size_sub(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
        self.resident_physical_size_gauge.sub(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    }

-    pub fn resident_physical_size_add(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_add(&self, sz: u64) {
        self.resident_physical_size_gauge.add(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

-    pub fn resident_physical_size_set(&self, sz: u64) {
-        self.resident_physical_size_gauge.set(sz);
-        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
-    }
-
-    pub fn resident_physical_size_get(&self) -> u64 {
+    pub(crate) fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -318,15 +318,6 @@ impl std::ops::Deref for PageWriteGuard<'_> {
    }
 }

-impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
-    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
-    }
-}
-
 impl<'a> PageWriteGuard<'a> {
    /// Mark that the buffer contents are now valid.
    #[must_use]
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -122,6 +122,7 @@ pub async fn libpq_listener_main(
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
+    cancel: CancellationToken,
 ) -> anyhow::Result<()> {
    listener.set_nonblocking(true)?;
    let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
@@ -130,7 +131,7 @@ pub async fn libpq_listener_main(
    while let Some(res) = tokio::select! {
        biased;

-        _ = task_mgr::shutdown_watcher() => {
+        _ = cancel.cancelled() => {
            // We were requested to shut down.
            None
        }
@@ -299,7 +300,7 @@ impl PageServerHandler {
                Ok(flush_r?)
            },
            _ = self.cancel.cancelled() => {
-                Err(QueryError::Other(anyhow::anyhow!("Shutting down")))
+                Err(QueryError::Shutdown)
            }
        )
    }
@@ -316,11 +317,11 @@ impl PageServerHandler {
                let msg = tokio::select! {
                    biased;

-                    _ = task_mgr::shutdown_watcher() => {
+                    _ = self.cancel.cancelled() => {
                        // We were requested to shut down.
                        let msg = "pageserver is shutting down";
                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                        Err(QueryError::Other(anyhow::anyhow!(msg)))
+                        Err(QueryError::Shutdown)
                    }

                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
@@ -414,10 +415,10 @@ impl PageServerHandler {
            let msg = tokio::select! {
                biased;

-                _ = task_mgr::shutdown_watcher() => {
+                _ = self.cancel.cancelled() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
-                    break;
+                    return Err(QueryError::Shutdown)
                }

                msg = pgb.read_message() => { msg }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -19,6 +19,7 @@ use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
+use std::ops::ControlFlow;
 use std::ops::Range;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
@@ -370,7 +371,6 @@ impl Timeline {
        }
    }

-    ///
    /// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
    /// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
    ///
@@ -385,6 +385,50 @@ impl Timeline {
        found_larger: &mut bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
+        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
+            if timestamp >= search_timestamp {
+                *found_larger = true;
+                return ControlFlow::Break(true);
+            } else {
+                *found_smaller = true;
+            }
+            ControlFlow::Continue(())
+        })
+        .await
+    }
+
+    /// Obtain the latest timestamp recorded for the given lsn.
+    ///
+    /// Returns `None` if the lsn has no timestamps, otherwise the maximum timestamp found.
+    pub async fn get_timestamp_for_lsn(
+        &self,
+        probe_lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<Option<TimestampTz>, PageReconstructError> {
+        let mut max: Option<TimestampTz> = None;
+        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
+            if let Some(max_prev) = max {
+                max = Some(max_prev.max(timestamp));
+            } else {
+                max = Some(timestamp);
+            }
+            ControlFlow::Continue(())
+        })
+        .await?;
+
+        Ok(max)
+    }
+
+    /// Runs the given function on all the timestamps for a given lsn
+    ///
+    /// The return value is either given by the closure, or set to the `Default`
+    /// impl's output.
+    async fn map_all_timestamps<T: Default>(
+        &self,
+        probe_lsn: Lsn,
+        ctx: &RequestContext,
+        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
+    ) -> Result<T, PageReconstructError> {
        for segno in self
            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
            .await?
@@ -402,16 +446,14 @@ impl Timeline {
                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);

-                    if timestamp >= search_timestamp {
-                        *found_larger = true;
-                        return Ok(true);
-                    } else {
-                        *found_smaller = true;
+                    match f(timestamp) {
+                        ControlFlow::Break(b) => return Ok(b),
+                        ControlFlow::Continue(()) => (),
                    }
                }
            }
        }
-        Ok(false)
+        Ok(Default::default())
    }

    /// Get a list of SLRU segments
@@ -499,6 +541,24 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

+    pub async fn list_aux_files(
+        &self,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
+        match self.get(AUX_FILES_KEY, lsn, ctx).await {
+            Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
+                Ok(dir) => Ok(dir.files),
+                Err(e) => Err(PageReconstructError::from(e)),
+            },
+            Err(e) => {
+                // This is expected: historical databases do not have the key.
+                debug!("Failed to get info about AUX files: {}", e);
+                Ok(HashMap::new())
+            }
+        }
+    }
+
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used to initialize the logical size tracking on startup.
    ///
@@ -616,7 +676,9 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
-
+        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
+            result.add_key(AUX_FILES_KEY);
+        }
        Ok(result.to_keyspace())
    }

@@ -692,6 +754,12 @@ impl<'a> DatadirModification<'a> {
        })?;
        self.put(DBDIR_KEY, Value::Image(buf.into()));

+        // Create AuxFilesDirectory
+        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+            files: HashMap::new(),
+        })?;
+        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+
        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
            xids: HashSet::new(),
        })?;
@@ -796,6 +864,12 @@ impl<'a> DatadirModification<'a> {
            // 'true', now write the updated 'dbdirs' map back.
            let buf = DbDirectory::ser(&dbdir)?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));
+
+            // Create AuxFilesDirectory as well
+            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+                files: HashMap::new(),
+            })?;
+            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
        }
        if r.is_none() {
            // Create RelDirectory
@@ -1120,6 +1194,37 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    /// Insert, overwrite, or remove one auxiliary file in the directory kept
+    /// under `AUX_FILES_KEY`.
+    ///
+    /// The current directory is read and deserialized, updated, and handed
+    /// back to `self.put` as a single `Value::Image`:
+    /// - empty `content` removes the entry for `path` (if any),
+    /// - non-empty `content` inserts or overwrites the entry for `path`.
+    ///
+    /// Returns an error if the existing value cannot be deserialized or the
+    /// updated directory cannot be serialized.
+    pub async fn put_file(
+        &mut self,
+        path: &str,
+        content: &[u8],
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
+            Ok(buf) => AuxFilesDirectory::des(&buf)?,
+            Err(e) => {
+                // This is expected: historical databases do not have the key.
+                // NOTE(review): any `get` error starts from an empty directory
+                // here, not only a missing key.
+                debug!("Failed to get info about AUX files: {}", e);
+                AuxFilesDirectory {
+                    files: HashMap::new(),
+                }
+            }
+        };
+        let path = path.to_string();
+        // Empty content is the signal to delete the file.
+        if content.is_empty() {
+            dir.files.remove(&path);
+        } else {
+            dir.files.insert(path, Bytes::copy_from_slice(content));
+        }
+        // Write the whole directory back as one image.
+        self.put(
+            AUX_FILES_KEY,
+            Value::Image(Bytes::from(
+                AuxFilesDirectory::ser(&dir).context("serialize")?,
+            )),
+        );
+        Ok(())
+    }
+
    ///
    /// Flush changes accumulated so far to the underlying repository.
    ///
@@ -1255,6 +1360,11 @@ struct RelDirectory {
    rels: HashSet<(Oid, u8)>,
 }

+/// Serialized form of the value stored under `AUX_FILES_KEY`: the complete
+/// set of auxiliary files, held together in one map.
+#[derive(Debug, Serialize, Deserialize, Default)]
+struct AuxFilesDirectory {
+    // File path -> file contents.
+    files: HashMap<String, Bytes>,
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 struct RelSizeEntry {
    nblocks: u32,
@@ -1303,10 +1413,12 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 // 02 pg_twophase
 //
 // 03 misc
-//    controlfile
+//    Controlfile
 //    checkpoint
 //    pg_version
 //
+// Aux files also live in section 03 (see AuxFiles in the list below).
+//
 // Below is a full list of the keyspace allocation:
 //
 // DbDir:
@@ -1344,6 +1456,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 //
 // Checkpoint:
 // 03 00000000 00000000 00000000 00   00000001
+//
+// AuxFiles:
+// 03 00000000 00000000 00000000 00   00000002
+//
+
 //-- Section 01: relation data and metadata

 const DBDIR_KEY: Key = Key {
@@ -1567,6 +1684,15 @@ const CHECKPOINT_KEY: Key = Key {
    field6: 1,
 };

+// Key under which the serialized AuxFilesDirectory is stored.
+// It uses the same 0x03 prefix as CHECKPOINT_KEY above, with field6 = 2:
+// 03 00000000 00000000 00000000 00   00000002
+const AUX_FILES_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 2,
+};
+
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -28,8 +28,9 @@ impl Key {
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
-        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
-        (((self.field1 & 0xf) as i128) << 120)
+        assert!(self.field1 < 0xF);
+        assert!(self.field2 < 0xFFFF);
+        (((self.field1 & 0xF) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
            | ((self.field4 as i128) << 40)
@@ -149,8 +150,8 @@ impl Key {
        field6: u32::MIN,
    };
    pub const MAX: Key = Key {
-        field1: u8::MAX,
-        field2: u32::MAX,
+        field1: 0xF - 1,
+        field2: 0xFFFF - 1,
        field3: u32::MAX,
        field4: u32::MAX,
        field5: u8::MAX,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -3,10 +3,10 @@ use std::sync::Arc;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, instrument, warn, Instrument, Span};
+use tracing::{error, instrument, warn, Instrument, Span};

 use utils::{
    backoff, completion, crashsafe, fs_ext,
@@ -25,11 +25,9 @@ use super::{
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant,
+    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
 };

-const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
@@ -60,7 +58,7 @@ fn remote_tenant_delete_mark_path(
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
+    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
 }

 async fn create_remote_delete_mark(
@@ -150,7 +148,8 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        // Display first 10 items in directory
-        let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
+        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
+        let list = &list.into_iter().take(10).collect::<Vec<_>>();
        return Err(DeleteTenantError::Other(anyhow::anyhow!(
            "Timelines directory is not empty after all timelines deletion: {list:?}"
        )));
@@ -239,32 +238,6 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

-pub(crate) async fn remote_delete_mark_exists(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-    remote_storage: &GenericRemoteStorage,
-) -> anyhow::Result<bool> {
-    // If remote storage is there we rely on it
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
-
-    let result = backoff::retry(
-        || async { remote_storage.download(&remote_mark_path).await },
-        |e| matches!(e, DownloadError::NotFound),
-        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-        "fetch_tenant_deletion_mark",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
-    )
-    .await;
-
-    match result {
-        Ok(_) => Ok(true),
-        Err(DownloadError::NotFound) => Ok(false),
-        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
-    }
-}
-
 /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -276,10 +249,9 @@ pub(crate) async fn remote_delete_mark_exists(
 /// 6. Remove remote mark
 /// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
+/// There are two entrypoints to the process:
 /// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
-/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
+/// 2. [`DeleteTenantFlow::resume_from_attach`] is called to resume deletion when the tenant is found to be deleted during the attach process.
 ///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
 #[derive(Default)]
 pub enum DeleteTenantFlow {
@@ -378,7 +350,7 @@ impl DeleteTenantFlow {

    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
+        remote_mark_exists: bool,
        tenant: &Tenant,
    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
        let acquire = |t: &Tenant| {
@@ -389,66 +361,25 @@ impl DeleteTenantFlow {
            )
        };

-        let tenant_id = tenant.tenant_id;
-        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+        if remote_mark_exists {
            return Ok(acquire(tenant));
        }

-        let remote_storage = match remote_storage {
-            Some(remote_storage) => remote_storage,
-            None => return Ok(None),
-        };
-
-        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
+        let tenant_id = tenant.tenant_id;
+        // No remote mark: fall back to checking for the local mark file.
+        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
            Ok(acquire(tenant))
        } else {
            Ok(None)
        }
    }

-    pub(crate) async fn resume_from_load(
-        guard: DeletionGuard,
-        tenant: &Arc<Tenant>,
-        init_order: Option<&InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        ctx: &RequestContext,
-    ) -> Result<(), DeleteTenantError> {
-        let (_, progress) = completion::channel();
-
-        tenant
-            .set_stopping(progress, true, false)
-            .await
-            .expect("cant be stopping or broken");
-
-        // Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
-        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
-        if let Some(background) = background_jobs_can_start {
-            info!("waiting for backgound jobs barrier");
-            background.clone().wait().await;
-            info!("ready for backgound jobs barrier");
-        }
-
-        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
-        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
-        if timelines_path.exists() {
-            tenant.load(init_order, ctx).await.context("load")?;
-        }
-
-        Self::background(
-            guard,
-            tenant.conf,
-            tenant.remote_storage.clone(),
-            tenants,
-            tenant,
-        )
-        .await
-    }
-
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
+        preload: Option<TenantPreload>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -458,7 +389,10 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant.attach(ctx).await.context("attach")?;
+        tenant
+            .attach(init_order, preload, ctx)
+            .await
+            .context("attach")?;

        Self::background(
            guard,
--- a/pageserver/src/tenant/disk_btree_test_data.rs
+++ b/pageserver/src/tenant/disk_btree_test_data.rs
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -354,8 +354,7 @@ mod tests {
        }

        // Test a large blob that spans multiple pages
-        let mut large_data = Vec::new();
-        large_data.resize(20000, 0);
+        let mut large_data = vec![0; 20000];
        thread_rng().fill_bytes(&mut large_data);
        let pos_large = file.write_blob(&large_data, &ctx).await?;
        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -639,147 +639,10 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for layer in self.iter_historic_layers() {
-            layer.dump(verbose, ctx)?;
+        for desc in self.iter_historic_layers() {
+            desc.dump();
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::LayerMap;
-    use crate::tenant::storage_layer::LayerFileName;
-    use std::str::FromStr;
-    use std::sync::Arc;
-
-    mod l0_delta_layers_updated {
-
-        use crate::tenant::{
-            storage_layer::{AsLayerDesc, PersistentLayerDesc},
-            timeline::layer_manager::LayerFileManager,
-        };
-
-        use super::*;
-
-        struct LayerObject(PersistentLayerDesc);
-
-        impl AsLayerDesc for LayerObject {
-            fn layer_desc(&self) -> &PersistentLayerDesc {
-                &self.0
-            }
-        }
-
-        impl LayerObject {
-            fn new(desc: PersistentLayerDesc) -> Self {
-                LayerObject(desc)
-            }
-        }
-
-        type TestLayerFileManager = LayerFileManager<LayerObject>;
-
-        #[test]
-        fn for_full_range_delta() {
-            // l0_delta_layers are used by compaction, and should observe all buffered updates
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                 true
-             )
-        }
-
-        #[test]
-        fn for_non_full_range_delta() {
-            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                 // because not full range
-                 false
-             )
-        }
-
-        #[test]
-        fn for_image() {
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                 // code only checks if it is a full range layer, doesn't care about images, which must
-                 // mean we should in practice never have full range images
-                 false
-             )
-        }
-
-        #[test]
-        fn replacing_missing_l0_is_notfound() {
-            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
-            // however only happen for precondition failures.
-
-            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
-            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = PersistentLayerDesc::from(layer);
-
-            // same skeletan construction; see scenario below
-            let not_found = Arc::new(LayerObject::new(layer.clone()));
-            let new_version = Arc::new(LayerObject::new(layer));
-
-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
-
-            let mut mapping = TestLayerFileManager::new();
-
-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
-        }
-
-        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
-            let name = LayerFileName::from_str(layer_name).unwrap();
-            let skeleton = PersistentLayerDesc::from(name);
-
-            let remote = Arc::new(LayerObject::new(skeleton.clone()));
-            let downloaded = Arc::new(LayerObject::new(skeleton));
-
-            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();
-
-            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
-            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
-
-            let expected_in_counts = (1, usize::from(expected_l0));
-
-            map.batch_update()
-                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
-            assert_eq!(
-                count_layer_in(&map, remote.layer_desc()),
-                expected_in_counts
-            );
-
-            mapping
-                .replace_and_verify(remote, downloaded.clone())
-                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
-            );
-
-            map.batch_update().remove_historic(downloaded.layer_desc());
-            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
-        }
-
-        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
-            let historic = map
-                .iter_historic_layers()
-                .filter(|x| x.key() == layer.key())
-                .count();
-            let l0s = map
-                .get_level0_deltas()
-                .expect("why does this return a result");
-            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
-
-            (historic, l0)
-        }
-    }
-}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1,7 +1,7 @@
 //! This module acts as a switchboard to access different repositories managed by this
 //! page server.

-use camino::{Utf8Path, Utf8PathBuf};
+use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use rand::{distributions::Alphanumeric, Rng};
 use std::collections::{hash_map, HashMap};
 use std::sync::Arc;
@@ -26,9 +26,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
-use crate::tenant::{
-    create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
-};
+use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -151,6 +149,49 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U

 static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));

+/// Create a directory, including parents.  This does no fsyncs and makes
+/// no guarantees about the persistence of the resulting metadata: for
+/// use when creating dirs for use as cache.
+///
+/// Walks from `path` up through its ancestors until an existing directory is
+/// found, then creates the missing directories from the outermost one down.
+///
+/// Errors:
+/// - `AlreadyExists` if `path` or a visited ancestor exists but is not a directory
+/// - `InvalidInput` if the walk runs out of parents before finding a directory
+/// - any other error returned by `tokio::fs::metadata` or `tokio::fs::create_dir`
+async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
+    // Missing paths, collected innermost first.
+    let mut dirs_to_create = Vec::new();
+    let mut path: &Utf8Path = path.as_ref();
+
+    // Figure out which directories we need to create.
+    loop {
+        let meta = tokio::fs::metadata(path).await;
+        match meta {
+            // Reached an existing directory: stop walking up.
+            Ok(metadata) if metadata.is_dir() => break,
+            Ok(_) => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::AlreadyExists,
+                    format!("non-directory found in path: {path}"),
+                ));
+            }
+            // Does not exist yet: record it below and keep walking up.
+            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => return Err(e),
+        }
+
+        dirs_to_create.push(path);
+
+        match path.parent() {
+            Some(parent) => path = parent,
+            None => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    format!("can't find parent of path '{path}'"),
+                ));
+            }
+        }
+    }
+
+    // Create directories from parent to child.
+    // NOTE(review): presumably this fails if a directory appears between the
+    // metadata check above and this call; confirm callers cannot race.
+    for &path in dirs_to_create.iter().rev() {
+        tokio::fs::create_dir(path).await?;
+    }
+
+    Ok(())
+}
+
 fn emergency_generations(
    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
 ) -> HashMap<TenantId, Generation> {
@@ -212,83 +253,99 @@ async fn init_load_generations(
    Ok(Some(generations))
 }

+/// Given a directory discovered in the pageserver's tenants/ directory, attempt
+/// to load a tenant config from it.
+///
+/// Returns `Ok(None)` when the entry is skipped:
+/// - `crate::is_temporary` reports it as temporary (removal is attempted),
+/// - it is an empty directory (removal is attempted),
+/// - it contains an ignore mark file, or
+/// - its file name does not parse as a `TenantId`.
+///
+/// Otherwise returns the tenant id together with the result of
+/// `Tenant::load_tenant_config`; a failure to load the config is carried in
+/// that inner result rather than failing this function. The only error
+/// returned directly is a failure of the empty-directory check.
+///
+/// Removals use blocking `std::fs` calls.
+fn load_tenant_config(
+    conf: &'static PageServerConf,
+    dentry: Utf8DirEntry,
+) -> anyhow::Result<Option<(TenantId, anyhow::Result<LocationConf>)>> {
+    let tenant_dir_path = dentry.path().to_path_buf();
+    if crate::is_temporary(&tenant_dir_path) {
+        info!("Found temporary tenant directory, removing: {tenant_dir_path}");
+        // No need to use safe_remove_tenant_dir_all because this is already
+        // a temporary path
+        // A failed removal is only logged; the entry is skipped either way.
+        if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
+            error!(
+                "Failed to remove temporary directory '{}': {:?}",
+                tenant_dir_path, e
+            );
+        }
+        return Ok(None);
+    }
+
+    // This case happens if we crash during attachment before writing a config into the dir
+    let is_empty = tenant_dir_path
+        .is_empty_dir()
+        .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?;
+    if is_empty {
+        info!("removing empty tenant directory {tenant_dir_path:?}");
+        // As above, a failed removal is logged and the entry is skipped.
+        if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
+            error!(
+                "Failed to remove empty tenant directory '{}': {e:#}",
+                tenant_dir_path
+            )
+        }
+        return Ok(None);
+    }
+
+    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+    if tenant_ignore_mark_file.exists() {
+        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+        return Ok(None);
+    }
+
+    // The directory name is expected to be the tenant id.
+    let tenant_id = match tenant_dir_path
+        .file_name()
+        .unwrap_or_default()
+        .parse::<TenantId>()
+    {
+        Ok(id) => id,
+        Err(_) => {
+            warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",);
+            return Ok(None);
+        }
+    };
+
+    // The config load result is passed through unexamined.
+    Ok(Some((
+        tenant_id,
+        Tenant::load_tenant_config(conf, &tenant_id),
+    )))
+}
+
 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
 /// and load configurations for the tenants we found.
+///
+/// Do this in parallel, because we expect 10k+ tenants, so serial execution can take
+/// seconds even on reasonably fast drives.
 async fn init_load_tenant_configs(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
    let tenants_dir = conf.tenants_path();

-    let mut dir_entries = tenants_dir
-        .read_dir_utf8()
-        .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+    let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
+        let dir_entries = tenants_dir
+            .read_dir_utf8()
+            .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+
+        Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
+    })
+    .await??;

    let mut configs = HashMap::new();

-    loop {
-        match dir_entries.next() {
-            None => break,
-            Some(Ok(dentry)) => {
-                let tenant_dir_path = dentry.path().to_path_buf();
-                if crate::is_temporary(&tenant_dir_path) {
-                    info!("Found temporary tenant directory, removing: {tenant_dir_path}");
-                    // No need to use safe_remove_tenant_dir_all because this is already
-                    // a temporary path
-                    if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
-                        error!(
-                            "Failed to remove temporary directory '{}': {:?}",
-                            tenant_dir_path, e
-                        );
-                    }
-                    continue;
-                }
+    let mut join_set = JoinSet::new();
+    for dentry in dentries {
+        join_set.spawn_blocking(move || load_tenant_config(conf, dentry));
+    }

-                // This case happens if we:
-                // * crash during attach before creating the attach marker file
-                // * crash during tenant delete before removing tenant directory
-                let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
-                    format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
-                })?;
-                if is_empty {
-                    info!("removing empty tenant directory {tenant_dir_path:?}");
-                    if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
-                        error!(
-                            "Failed to remove empty tenant directory '{}': {e:#}",
-                            tenant_dir_path
-                        )
-                    }
-                    continue;
-                }
-
-                let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-                if tenant_ignore_mark_file.exists() {
-                    info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-                    continue;
-                }
-
-                let tenant_id = match tenant_dir_path
-                    .file_name()
-                    .unwrap_or_default()
-                    .parse::<TenantId>()
-                {
-                    Ok(id) => id,
-                    Err(_) => {
-                        warn!(
-                            "Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",
-                        );
-                        continue;
-                    }
-                };
-
-                configs.insert(tenant_id, Tenant::load_tenant_config(conf, &tenant_id));
-            }
-            Some(Err(e)) => {
-                // An error listing the top level directory indicates serious problem
-                // with local filesystem: we will fail to load, and fail to start.
-                anyhow::bail!(e);
-            }
+    while let Some(r) = join_set.join_next().await {
+        if let Some((tenant_id, tenant_config)) = r?? {
+            configs.insert(tenant_id, tenant_config);
        }
    }
+
    Ok(configs)
 }

@@ -377,14 +434,15 @@ pub async fn init_tenant_mgr(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

-        match schedule_local_tenant_processing(
+        match tenant_spawn(
            conf,
            tenant_id,
            &tenant_dir_path,
-            AttachedTenantConf::try_from(location_conf)?,
            resources.clone(),
+            AttachedTenantConf::try_from(location_conf)?,
            Some(init_order.clone()),
            &TENANTS,
+            SpawnMode::Normal,
            &ctx,
        ) {
            Ok(tenant) => {
@@ -404,15 +462,18 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

+/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
+/// a broken tenant in the map if Tenant::spawn fails.
 #[allow(clippy::too_many_arguments)]
-pub(crate) fn schedule_local_tenant_processing(
+pub(crate) fn tenant_spawn(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    tenant_path: &Utf8Path,
-    location_conf: AttachedTenantConf,
    resources: TenantSharedResources,
+    location_conf: AttachedTenantConf,
    init_order: Option<InitializationOrder>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
+    mode: SpawnMode,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -436,37 +497,24 @@ pub(crate) fn schedule_local_tenant_processing(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
-        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if resources.remote_storage.is_none() {
-            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
-            Tenant::create_broken_tenant(
-                conf,
-                tenant_id,
-                "attaching mark file present but no remote storage configured".to_string(),
-            )
-        } else {
-            match Tenant::spawn_attach(conf, tenant_id, resources, location_conf, tenants, ctx) {
-                Ok(tenant) => tenant,
-                Err(e) => {
-                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
-                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
-                }
-            }
+    info!("Attaching tenant {tenant_id}");
+    let tenant = match Tenant::spawn(
+        conf,
+        tenant_id,
+        resources,
+        location_conf,
+        init_order,
+        tenants,
+        mode,
+        ctx,
+    ) {
+        Ok(tenant) => tenant,
+        Err(e) => {
+            error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
+            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
        }
-    } else {
-        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
-        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(
-            conf,
-            tenant_id,
-            location_conf,
-            resources,
-            init_order,
-            tenants,
-            ctx,
-        )
    };
+
    Ok(tenant)
 }

@@ -602,29 +650,41 @@ pub(crate) async fn create_tenant(
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
-
        let location_conf = LocationConf::attached_single(tenant_conf, generation);

        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
+        super::create_tenant_files(conf, &location_conf, &tenant_id).await?;
+
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        let created_tenant =
-            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
+        let tenant_path = conf.tenant_path(&tenant_id);
+
+        let created_tenant = tenant_spawn(
+            conf,
+            tenant_id,
+            &tenant_path,
+            resources,
+            AttachedTenantConf::try_from(location_conf)?,
+            None,
+            &TENANTS,
+            SpawnMode::Create,
+            ctx,
+        )?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

        let crated_tenant_id = created_tenant.tenant_id();
        anyhow::ensure!(
-                tenant_id == crated_tenant_id,
-                "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
-            );
+            tenant_id == crated_tenant_id,
+            "loaded created tenant has unexpected tenant id \
+                (expect {tenant_id} != actual {crated_tenant_id})",
+        );
        Ok(created_tenant)
-    }).await
+    })
+    .await
 }

 #[derive(Debug, thiserror::Error)]
@@ -655,7 +715,7 @@ pub(crate) async fn set_new_tenant_config(
    Ok(())
 }

-#[instrument(skip_all, fields(tenant_id, new_location_config))]
+#[instrument(skip_all, fields(%tenant_id))]
 pub(crate) async fn upsert_location(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
@@ -733,37 +793,56 @@ pub(crate) async fn upsert_location(
                }
            }

+            let tenant_path = conf.tenant_path(&tenant_id);
+
            let new_slot = match &new_location_config.mode {
-                LocationMode::Secondary(_) => TenantSlot::Secondary,
-                LocationMode::Attached(_attach_config) => {
-                    // Do a schedule_local_tenant_processing
-                    // FIXME: should avoid doing this disk I/O inside the TenantsMap lock,
-                    // we have the same problem in load_tenant/attach_tenant.  Probably
-                    // need a lock in TenantSlot to fix this.
+                LocationMode::Secondary(_) => {
+                    // Directory doesn't need to be fsync'd because if we crash it can
+                    // safely be recreated next time this tenant location is configured.
+                    unsafe_create_dir_all(&tenant_path)
+                        .await
+                        .with_context(|| format!("Creating {tenant_path}"))?;
+
                    Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
                        .await
                        .map_err(SetNewTenantConfigError::Persist)?;
-                    let tenant_path = conf.tenant_path(&tenant_id);
-                    let resources = TenantSharedResources {
-                        broker_client,
-                        remote_storage,
-                        deletion_queue_client,
-                    };
-                    let new_tenant = schedule_local_tenant_processing(
+
+                    TenantSlot::Secondary
+                }
+                LocationMode::Attached(_attach_config) => {
+                    // FIXME: should avoid doing this disk I/O inside the TenantsMap lock,
+                    // we have the same problem in load_tenant/attach_tenant.  Probably
+                    // need a lock in TenantSlot to fix this.
+                    let timelines_path = conf.timelines_path(&tenant_id);
+
+                    // Directory doesn't need to be fsync'd because we do not depend on
+                    // it to exist after crashes: it may be recreated when tenant is
+                    // re-attached, see https://github.com/neondatabase/neon/issues/5550
+                    unsafe_create_dir_all(&timelines_path)
+                        .await
+                        .with_context(|| format!("Creating {timelines_path}"))?;
+
+                    Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
+                        .await
+                        .map_err(SetNewTenantConfigError::Persist)?;
+
+                    let tenant = tenant_spawn(
                        conf,
                        tenant_id,
                        &tenant_path,
+                        TenantSharedResources {
+                            broker_client,
+                            remote_storage,
+                            deletion_queue_client,
+                        },
                        AttachedTenantConf::try_from(new_location_config)?,
-                        resources,
                        None,
                        &TENANTS,
+                        SpawnMode::Normal,
                        ctx,
-                    )
-                    .with_context(|| {
-                        format!("Failed to schedule tenant processing in path {tenant_path:?}")
-                    })?;
+                    )?;

-                    TenantSlot::Attached(new_tenant)
+                    TenantSlot::Attached(tenant)
                }
            };

@@ -771,7 +850,6 @@ pub(crate) async fn upsert_location(
        })
        .await?;
    }
-
    Ok(())
 }

@@ -951,7 +1029,7 @@ pub(crate) async fn load_tenant(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

-        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None,  &TENANTS, ctx)
+        let new_tenant = tenant_spawn(conf, tenant_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, None,  &TENANTS, SpawnMode::Normal, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -1025,18 +1103,12 @@ pub(crate) async fn attach_tenant(
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
        let location_conf = LocationConf::attached_single(tenant_conf, generation);
-        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
+        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
-        let marker_file_exists = conf
-            .tenant_attaching_mark_file_path(&tenant_id)
-            .try_exists()
-            .context("check for attach marker file existence")?;
-        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
-
-        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
+        let attached_tenant = tenant_spawn(conf, tenant_id, &tenant_dir,
+            resources, AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Normal, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -57,8 +57,7 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
    fsync_in_thread_pool(paths)
 }

-/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
-/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
+/// Parallel fsync asynchronously.
 pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
    const MAX_CONCURRENT_FSYNC: usize = 64;
    let mut next = paths.iter().peekable();
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -167,39 +167,15 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
-//!   - if the remote `IndexPart`'s metadata was newer than the metadata in
-//!     the local filesystem, write the remote metadata to the local filesystem
 //! - After the above is done for each timeline, open the tenant for business by
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! We keep track of the fact that a client is in `Attaching` state in a marker
-//! file on the local disk. This is critical because, when we restart the pageserver,
-//! we do not want to do the `List timelines` step for each tenant that has already
-//! been successfully attached (for performance & cost reasons).
-//! Instead, for a tenant without the attach marker file, we assume that the
-//! local state is in sync or ahead of the remote state. This includes the list
-//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
-//! if there's a timeline on the remote that the pageserver doesn't know about,
-//! the GC will not consider its branch point, leading to data loss.
-//! So, for a tenant with the attach marker file, we know that we do not yet have
-//! persisted all the remote timeline's metadata files locally. To exclude the
-//! risk above, we re-run the procedure for such tenants
-//!
 //! # Operating Without Remote Storage
 //!
 //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
 //! not created and the uploads are skipped.
-//! Theoretically, it should be ok to remove and re-add remote storage configuration to
-//! the pageserver config at any time, since it doesn't make a difference to
-//! [`Timeline::load_layer_map`].
-//! Of course, the remote timeline dir must not change while we have de-configured
-//! remote storage, i.e., the pageserver must remain the owner of the given prefix
-//! in remote storage.
-//! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
@@ -211,8 +187,7 @@ mod upload;
 use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
-// re-export these
-pub use download::{is_temp_download_file, list_remote_timelines};
+
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use utils::backoff::{
@@ -237,7 +212,7 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
@@ -255,10 +230,13 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::LayerFileName;
+use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

+pub(crate) use download::{is_temp_download_file, list_remote_timelines};
+pub(crate) use index::LayerFileMetadata;
+
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -468,7 +446,10 @@ impl RemoteTimelineClient {
    //

    /// Download index file
-    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
+    pub async fn download_index_file(
+        &self,
+        cancel: CancellationToken,
+    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
            &RemoteOpKind::Download,
@@ -482,6 +463,7 @@ impl RemoteTimelineClient {
            &self.tenant_id,
            &self.timeline_id,
            self.generation,
+            cancel,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -627,101 +609,203 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub fn schedule_layer_file_upload(
+    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer_file_name: &LayerFileName,
-        layer_metadata: &LayerFileMetadata,
+        layer: ResidentLayer,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        upload_queue
-            .latest_files
-            .insert(layer_file_name.clone(), layer_metadata.clone());
-        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-
-        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
-        self.calls_unfinished_metric_begin(&op);
-        upload_queue.queued_operations.push_back(op);
-
-        info!("scheduled layer file upload {layer_file_name}");
-
-        // Launch the task immediately, if possible
+        self.schedule_layer_file_upload0(upload_queue, layer);
        self.launch_queued_tasks(upload_queue);
        Ok(())
    }

+    fn schedule_layer_file_upload0(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        layer: ResidentLayer,
+    ) {
+        let metadata = layer.metadata();
+
+        upload_queue
+            .latest_files
+            .insert(layer.layer_desc().filename(), metadata.clone());
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+
+        info!("scheduled layer file upload {layer}");
+        let op = UploadOp::UploadLayer(layer, metadata);
+        self.calls_unfinished_metric_begin(&op);
+        upload_queue.queued_operations.push_back(op);
+    }
+
    /// Launch a delete operation in the background.
    ///
-    /// The operation does not modify local state but assumes the local files have already been
-    /// deleted, and is used to mirror those changes to remote.
+    /// The operation does not modify local filesystem state.
    ///
    /// Note: This schedules an index file upload before the deletions.  The
-    /// deletion won't actually be performed, until any previously scheduled
+    /// deletion won't actually be performed, until all previously scheduled
    /// upload operations, and the index file upload, have completed
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: Vec<LayerFileName>,
+        names: &[LayerFileName],
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

+        let with_generations =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
+
+        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+
+        // Launch the tasks immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
+    }
+
+    /// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
+    /// layer files, leaving them dangling.
+    ///
+    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
+    /// is invoked on them.
+    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        // just forget the return value; after uploading the next index_part.json, we can consider
+        // the layer files as "dangling". this is fine: in the worst case we create work for the
+        // scrubber.
+
+        let names = gc_layers.iter().map(|x| x.layer_desc().filename());
+
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
+
+        self.launch_queued_tasks(upload_queue);
+
+        Ok(())
+    }
+
+    /// Update the remote index file, removing the to-be-deleted files from the index,
+    /// allowing scheduling of actual deletions later.
+    fn schedule_unlinking_of_layers_from_index_part0<I>(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        names: I,
+    ) -> Vec<(LayerFileName, Generation)>
+    where
+        I: IntoIterator<Item = LayerFileName>,
+    {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

-        // Update the remote index file, removing the to-be-deleted files from the index,
-        // before deleting the actual files.
-        //
-        // Once we start removing files from upload_queue.latest_files, there's
-        // no going back! Otherwise, some of the files would already be removed
-        // from latest_files, but not yet scheduled for deletion. Use a closure
-        // to syntactically forbid ? or bail! calls here.
-        let no_bail_here = || {
-            // Decorate our list of names with each name's generation, dropping
-            // makes that are unexpectedly missing from our metadata.
-            let with_generations: Vec<_> = names
-                .into_iter()
-                .filter_map(|name| {
-                    // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(&name);
+        // Decorate our list of names with each name's generation, dropping
+        // names that are unexpectedly missing from our metadata.
+        let with_generations: Vec<_> = names
+            .into_iter()
+            .filter_map(|name| {
+                let meta = upload_queue.latest_files.remove(&name);

-                    if let Some(meta) = meta {
-                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                        Some((name, meta.generation))
-                    } else {
-                        // This can only happen if we forgot to to schedule the file upload
-                        // before scheduling the delete. Log it because it is a rare/strange
-                        // situation, and in case something is misbehaving, we'd like to know which
-                        // layers experienced this.
-                        info!(
-                            "Deleting layer {name} not found in latest_files list, never uploaded?"
-                        );
-                        None
-                    }
-                })
-                .collect();
+                if let Some(meta) = meta {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                    Some((name, meta.generation))
+                } else {
+                    // This can only happen if we forgot to schedule the file upload
+                    // before scheduling the delete. Log it because it is a rare/strange
+                    // situation, and in case something is misbehaving, we'd like to know which
+                    // layers experienced this.
+                    info!("Deleting layer {name} not found in latest_files list, never uploaded?");
+                    None
+                }
+            })
+            .collect();

-            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata);
+        #[cfg(feature = "testing")]
+        for (name, gen) in &with_generations {
+            if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
+                if &unexpected == gen {
+                    tracing::error!("{name} was unlinked twice with same generation");
+                } else {
+                    tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
+                }
            }
+        }

-            for (name, gen) in &with_generations {
-                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+        // after unlinking files from the upload_queue.latest_files we must always schedule an
+        // index_part update, because that needs to be uploaded before we can actually delete the
+        // files.
+        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+            self.schedule_index_upload(upload_queue, metadata);
+        }
+
+        with_generations
+    }
+
+    /// Schedules deletion for layer files which have previously been unlinked from the
+    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
+    pub(crate) fn schedule_deletion_of_unlinked(
+        self: &Arc<Self>,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        self.schedule_deletion_of_unlinked0(upload_queue, layers);
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
+    }
+
+    fn schedule_deletion_of_unlinked0(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        with_generations: Vec<(LayerFileName, Generation)>,
+    ) {
+        for (name, gen) in &with_generations {
+            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+        }
+
+        #[cfg(feature = "testing")]
+        for (name, gen) in &with_generations {
+            match upload_queue.dangling_files.remove(name) {
+                Some(same) if &same == gen => { /* expected */ }
+                Some(other) => {
+                    tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
+                }
+                None => {
+                    tracing::error!("{name} was unlinked but was not dangling");
+                }
            }
+        }

-            // schedule the actual deletions
-            let op = UploadOp::Delete(Delete {
-                layers: with_generations,
-            });
-            self.calls_unfinished_metric_begin(&op);
-            upload_queue.queued_operations.push_back(op);
+        // schedule the actual deletions
+        let op = UploadOp::Delete(Delete {
+            layers: with_generations,
+        });
+        self.calls_unfinished_metric_begin(&op);
+        upload_queue.queued_operations.push_back(op);
+    }
+
+    /// Schedules a compaction update to the remote `index_part.json`.
+    ///
+    /// `compacted_from` represents the L0 names which have been `compacted_to` L1 layers.
+    pub(crate) fn schedule_compaction_update(
+        self: &Arc<Self>,
+        compacted_from: &[Layer],
+        compacted_to: &[ResidentLayer],
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        for layer in compacted_to {
+            self.schedule_layer_file_upload0(upload_queue, layer.clone());
+        }
+
+        let names = compacted_from.iter().map(|x| x.layer_desc().filename());
+
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
+        self.launch_queued_tasks(upload_queue);

-            // Launch the tasks immediately, if possible
-            self.launch_queued_tasks(upload_queue);
-        };
-        no_bail_here();
        Ok(())
    }

@@ -1093,16 +1177,12 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
-                    let path = self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(layer_file_name.file_name());
-
+                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        &path,
+                        path,
                        layer_metadata,
                        self.generation,
                    )
@@ -1376,6 +1456,8 @@ impl RemoteTimelineClient {
                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
+                        #[cfg(feature = "testing")]
+                        dangling_files: HashMap::default(),
                    };

                    let upload_queue = std::mem::replace(
@@ -1506,6 +1588,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
+            storage_layer::Layer,
            Generation, Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
@@ -1648,7 +1731,11 @@ mod tests {
        let client = timeline.remote_client.as_ref().unwrap();

        // Download back the index.json, and check that the list of files is correct
-        let initial_index_part = match client.download_index_file().await.unwrap() {
+        let initial_index_part = match client
+            .download_index_file(CancellationToken::new())
+            .await
+            .unwrap()
+        {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1674,32 +1761,29 @@ mod tests {
        let generation = harness.generation;

        // Create a couple of dummy files,  schedule upload for them
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
-        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
-        let content_1 = dummy_contents("foo");
-        let content_2 = dummy_contents("bar");
-        let content_3 = dummy_contents("baz");

-        for (filename, content) in [
-            (&layer_file_name_1, &content_1),
-            (&layer_file_name_2, &content_2),
-            (&layer_file_name_3, &content_3),
-        ] {
-            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
-        }
+        let layers = [
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
+        ]
+        .into_iter()
+        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
+
+            Layer::for_resident(
+                harness.conf,
+                &timeline,
+                name,
+                LayerFileMetadata::new(contents.len() as u64, generation),
+            )
+        }).collect::<Vec<_>>();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[0].clone())
            .unwrap();
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_2,
-                &LayerFileMetadata::new(content_2.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[1].clone())
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1740,7 +1824,11 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match client.download_index_file().await.unwrap() {
+        let index_part = match client
+            .download_index_file(CancellationToken::new())
+            .await
+            .unwrap()
+        {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1753,38 +1841,42 @@ mod tests {
                .collect(),
            &[
                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_3,
-                &LayerFileMetadata::new(content_3.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[2].clone())
            .unwrap();
+
+        // this is no longer consistent with how deletion works with Layer::drop, but in this test
+        // keep using schedule_layer_file_deletion because we don't have a way to wait for the
+        // spawn_blocking started by the drop.
        client
-            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            // Deletion schedules upload of the index file, and the file deletion itself
-            assert!(upload_queue.queued_operations.len() == 2);
-            assert!(upload_queue.inprogress_tasks.len() == 1);
-            assert!(upload_queue.num_inprogress_layer_uploads == 1);
-            assert!(upload_queue.num_inprogress_deletions == 0);
-            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
+            assert_eq!(upload_queue.queued_operations.len(), 2);
+            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
+            assert_eq!(upload_queue.num_inprogress_deletions, 0);
+            assert_eq!(
+                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
+                0
+            );
        }
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1798,8 +1890,8 @@ mod tests {
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layer_file_name_2.file_name(),
-                &layer_file_name_3.file_name(),
+                &layers[1].layer_desc().filename().file_name(),
+                &layers[2].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1828,6 +1920,13 @@ mod tests {
        )
        .unwrap();

+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+        );
+
        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
@@ -1863,10 +1962,7 @@ mod tests {
        let actual_a = get_bytes_started_stopped();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
-            )
+            .schedule_layer_file_upload(layer_file_1.clone())
            .unwrap();

        let actual_b = get_bytes_started_stopped();
@@ -1931,7 +2027,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
-            .download_index_file()
+            .download_index_file(CancellationToken::new())
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -19,7 +19,7 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

@@ -170,47 +170,43 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
 pub async fn list_remote_timelines(
    storage: &GenericRemoteStorage,
    tenant_id: TenantId,
-) -> anyhow::Result<HashSet<TimelineId>> {
+    cancel: CancellationToken,
+) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
    let remote_path = remote_timelines_path(&tenant_id);

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

-    let timelines = download_retry(
-        || storage.list_prefixes(Some(&remote_path)),
-        &format!("list prefixes for {tenant_id}"),
+    let listing = download_retry_forever(
+        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+        &format!("list timelines for {tenant_id}"),
+        cancel,
    )
    .await?;

-    if timelines.is_empty() {
-        anyhow::bail!("no timelines found on the remote storage")
-    }
-
    let mut timeline_ids = HashSet::new();
+    let mut other_prefixes = HashSet::new();

-    for timeline_remote_storage_key in timelines {
+    for timeline_remote_storage_key in listing.prefixes {
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;

-        let timeline_id: TimelineId = object_name
-            .parse()
-            .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
-
-        // list_prefixes is assumed to return unique names. Ensure this here.
-        // NB: it's safer to bail out than warn-log this because the pageserver
-        //     needs to absolutely know about _all_ timelines that exist, so that
-        //     GC knows all the branchpoints. If we skipped over a timeline instead,
-        //     GC could delete a layer that's still needed by that timeline.
-        anyhow::ensure!(
-            !timeline_ids.contains(&timeline_id),
-            "list_prefixes contains duplicate timeline id {timeline_id}"
-        );
-        timeline_ids.insert(timeline_id);
+        match object_name.parse::<TimelineId>() {
+            Ok(t) => timeline_ids.insert(t),
+            Err(_) => other_prefixes.insert(object_name.to_string()),
+        };
    }

-    Ok(timeline_ids)
+    for key in listing.keys {
+        let object_name = key
+            .object_name()
+            .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
+        other_prefixes.insert(object_name.to_string());
+    }
+
+    Ok((timeline_ids, other_prefixes))
 }

 async fn do_download_index_part(
@@ -218,10 +214,11 @@ async fn do_download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    index_generation: Generation,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);

-    let index_part_bytes = download_retry(
+    let index_part_bytes = download_retry_forever(
        || async {
            let mut index_part_download = storage.download(&remote_path).await?;

@@ -236,6 +233,7 @@ async fn do_download_index_part(
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
+        cancel,
    )
    .await?;

@@ -257,19 +255,28 @@ pub(super) async fn download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    my_generation: Generation,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
-        return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
+        return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
+            .await;
    }

    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
+    let res = do_download_index_part(
+        storage,
+        tenant_id,
+        timeline_id,
+        my_generation,
+        cancel.clone(),
+    )
+    .await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -289,8 +296,14 @@ pub(super) async fn download_index_part(
    //    we want to find the most recent index from a previous generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res =
-        do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
+    let res = do_download_index_part(
+        storage,
+        tenant_id,
+        timeline_id,
+        my_generation.previous(),
+        cancel.clone(),
+    )
+    .await;
    match res {
        Ok(index_part) => {
            tracing::debug!("Found index_part from previous generation");
@@ -334,13 +347,14 @@ pub(super) async fn download_index_part(
    match max_previous_generation {
        Some(g) => {
            tracing::debug!("Found index_part in generation {g:?}");
-            do_download_index_part(storage, tenant_id, timeline_id, g).await
+            do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
            tracing::info!("No index_part.json* found");
-            do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
+            do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
+                .await
        }
    }
 }
@@ -370,3 +384,23 @@ where
    )
    .await
 }
+
+async fn download_retry_forever<T, O, F>(
+    op: O,
+    description: &str,
+    cancel: CancellationToken,
+) -> Result<T, DownloadError>
+where
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, DownloadError>>,
+{
+    backoff::retry(
+        op,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        u32::MAX,
+        description,
+        backoff::Cancel::new(cancel, || DownloadError::Cancelled),
+    )
+    .await
+}
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -98,7 +98,7 @@ impl IndexPart {
    const LATEST_VERSION: usize = 4;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];

    pub const FILE_NAME: &'static str = "index_part.json";

--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -60,6 +60,8 @@ pub(super) async fn upload_timeline_layer<'a>(
        bail!("failpoint before-upload-layer")
    });

+    pausable_failpoint!("before-upload-layer-pausable");
+
    let storage_path = remote_path(conf, source_path, generation)?;
    let source_file_res = fs::File::open(&source_path).await;
    let source_file = match source_file_res {
@@ -70,6 +72,8 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
+            //
+            // This case is exercised by `test_compaction_delete_before_upload`.
            info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,26 +4,21 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer;
 mod layer_desc;
-mod remote_layer;

-use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
 use bytes::Bytes;
-use camino::Utf8PathBuf;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
-use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -39,7 +34,8 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-pub use remote_layer::RemoteLayer;
+
+pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -74,7 +70,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from Layer::get_page_reconstruct_data
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -179,26 +175,6 @@ impl LayerAccessStats {
        new
    }

-    /// Creates a clone of `self` and records `new_status` in the clone.
-    ///
-    /// The `new_status` is not recorded in `self`.
-    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn clone_for_residence_change(
-        &self,
-        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
-        let clone = {
-            let inner = self.0.lock().unwrap();
-            inner.clone()
-        };
-        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
-        new
-    }
-
    /// Record a change in layer residency.
    ///
    /// Recording the event must happen while holding the layer map lock to
@@ -321,95 +297,12 @@ impl LayerAccessStats {
    }
 }

-/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
-///
-/// All layers should implement a minimal `std::fmt::Debug` without tenant or
-/// timeline names, because those are known in the context of which the layers
-/// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
-    ///
-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// See PageReconstructResult for possible return values. The collected data
-    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call, or a struct with a cached older image of the page if one
-    /// is available. If this returns ValueReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data' to
-    /// collect more data.
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult>;
-}
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

-/// A Layer contains all data in a "rectangle" consisting of a range of keys and
-/// range of LSNs.
-///
-/// There are two kinds of layers, in-memory and on-disk layers. In-memory
-/// layers are used to ingest incoming WAL, and provide fast access to the
-/// recent page versions. On-disk layers are stored as files on disk, and are
-/// immutable. This trait presents the common functionality of in-memory and
-/// on-disk layers.
-///
-/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
-/// A delta layer contains all modifications within a range of LSNs and keys.
-/// An image layer is a snapshot of all the data in a key-range, at a single
-/// LSN.
-pub trait PersistentLayer: Layer + AsLayerDesc {
-    /// File name used for this layer, both in the pageserver's local filesystem
-    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName {
-        self.layer_desc().filename()
-    }
-
-    // Path to the layer file in the local filesystem.
-    // `None` for `RemoteLayer`.
-    fn local_path(&self) -> Option<Utf8PathBuf>;
-
-    /// Permanently remove this layer from disk.
-    fn delete_resident_layer_file(&self) -> Result<()>;
-
-    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        None
-    }
-
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        false
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
-
-    fn access_stats(&self) -> &LayerAccessStats;
-}
-
-pub fn downcast_remote_layer(
-    layer: &Arc<dyn PersistentLayer>,
-) -> Option<std::sync::Arc<RemoteLayer>> {
-    if layer.is_remote_layer() {
-        Arc::clone(layer).downcast_remote_layer()
-    } else {
-        None
-    }
-}
-
 pub mod tests {
    use super::*;

@@ -447,19 +340,6 @@ pub mod tests {
    }
 }

-/// Helper enum to hold a PageServerConf, or a path
-///
-/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
-/// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
-/// struct for a file on disk, without having a page server running, so that we have no
-/// config. In that case, we use the Path variant to hold the full path to the file on
-/// disk.
-enum PathOrConf {
-    Path(Utf8PathBuf),
-    Conf(&'static PageServerConf),
-}
-
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -34,18 +34,17 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::storage_layer::{
-    PersistentLayer, ValueReconstructResult, ValueReconstructState,
-};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
@@ -59,10 +58,7 @@ use utils::{
    lsn::Lsn,
 };

-use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
-    PersistentLayerDesc,
-};
+use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -182,20 +178,12 @@ impl DeltaKey {
    }
 }

-/// DeltaLayer is the in-memory data structure associated with an on-disk delta
-/// file.
-///
-/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
 pub struct DeltaLayer {
-    path_or_conf: PathOrConf,
-
+    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -212,6 +200,8 @@ impl std::fmt::Debug for DeltaLayer {
    }
 }

+/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
+/// file.
 pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -221,12 +211,6 @@ pub struct DeltaLayerInner {
    file: FileBlockReader,
 }

-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -236,19 +220,6 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for DeltaLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-}
 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
 impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -262,40 +233,9 @@ impl AsLayerDesc for DeltaLayer {
    }
 }

-impl PersistentLayer for DeltaLayer {
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        Some(self)
-    }
-
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        self.local_path()
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        self.delete_resident_layer_file()
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.info(reset)
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        self.access_stats()
-    }
-}
-
 impl DeltaLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
-        );
+        self.desc.dump();

        if !verbose {
            return Ok(());
@@ -303,119 +243,7 @@ impl DeltaLayer {

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

-        println!(
-            "index_start_blk: {}, root {}",
-            inner.index_start_blk, inner.index_root_blk
-        );
-
-        let file = &inner.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
-            file,
-        );
-
-        tree_reader.dump().await?;
-
-        let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
-
-        // A subroutine to dump a single blob
-        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
-        }
-
-        for entry in keys {
-            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val, ctx).await {
-                Ok(desc) => desc,
-                Err(err) => {
-                    let err: anyhow::Error = err;
-                    format!("ERROR: {err}")
-                }
-            };
-            println!("  key {key} at {lsn}: {desc}");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.desc.lsn_range.start);
-
-        ensure!(self.desc.key_range.contains(&key));
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-
-    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
-        Some(self.path())
-    }
-
-    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
-        Ok(())
-    }
-
-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_range = self.layer_desc().lsn_range.clone();
-
-        let access_stats = self.access_stats.as_api_model(reset);
-
-        HistoricLayerInfo::Delta {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start: lsn_range.start,
-            lsn_end: lsn_range.end,
-            remote: false,
-            access_stats,
-        }
-    }
-
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
-        fname: &DeltaFileName,
-    ) -> Utf8PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.clone(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(tenant_id, timeline_id)
-                .join(fname.to_string()),
-        }
+        inner.dump(ctx).await
    }

    fn temp_path_for(
@@ -461,52 +289,21 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = DeltaLayerInner::load(&path, None, ctx).await?;

-        let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;
+        // not production code
+        let actual_filename = path.file_name().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-
-            let actual_filename = path.file_name().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(Arc::new(loaded))
    }

-    /// Create a DeltaLayer struct representing an existing file on disk.
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &DeltaFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> DeltaLayer {
-        DeltaLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_delta(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn_range.clone(),
-                file_size,
-            ),
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -520,7 +317,7 @@ impl DeltaLayer {
            .context("get file metadata to determine size")?;

        Ok(DeltaLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
                summary.tenant_id,
                summary.timeline_id,
@@ -533,29 +330,9 @@ impl DeltaLayer {
        })
    }

-    fn layer_name(&self) -> DeltaFileName {
-        self.desc.delta_file_name()
-    }
    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> Utf8PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            &self.desc.tenant_id,
-            &self.desc.timeline_id,
-            &self.layer_name(),
-        )
-    }
-    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
-    ///
-    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .await
-            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner, ctx)
-            .await
-            .context("Layer index is corrupted")
+    fn path(&self) -> Utf8PathBuf {
+        self.path.clone()
    }
 }

@@ -660,7 +437,7 @@ impl DeltaLayerWriterInner {
    ///
    /// Finish writing the delta layer.
    ///
-    async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -717,37 +494,21 @@ impl DeltaLayerWriterInner {
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = DeltaLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc: PersistentLayerDesc::new_delta(
-                self.tenant_id,
-                self.timeline_id,
-                self.key_start..key_end,
-                self.lsn_range.clone(),
-                metadata.len(),
-            ),
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };
+
+        let desc = PersistentLayerDesc::new_delta(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_start..key_end,
+            self.lsn_range.clone(),
+            metadata.len(),
+        );

        // fsync the file
        file.sync_all().await?;
-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = DeltaLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            &self.tenant_id,
-            &self.timeline_id,
-            &DeltaFileName {
-                key_range: self.key_start..key_end,
-                lsn_range: self.lsn_range,
-            },
-        );
-        std::fs::rename(self.path, &final_path)?;

-        trace!("created delta layer {final_path}");
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+
+        trace!("created delta layer {}", layer.local_path());

        Ok(layer)
    }
@@ -828,8 +589,12 @@ impl DeltaLayerWriter {
    ///
    /// Finish writing the delta layer.
    ///
-    pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
-        self.inner.take().unwrap().finish(key_end).await
+    pub(crate) async fn finish(
+        mut self,
+        key_end: Key,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner.take().unwrap().finish(key_end, timeline).await
    }
 }

@@ -967,15 +732,17 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
-        this: &'a T,
-        ctx: &'b RequestContext,
+    pub(super) async fn load_keys<'a>(
+        &'a self,
+        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
-        let dl = this.as_ref();
-        let file = &dl.file;
+        let file = &self.file;

-        let tree_reader =
-            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

@@ -988,7 +755,7 @@ impl DeltaLayerInner {
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(dl),
+                            Adapter(self),
                        )),
                    };
                    let pos = BlobRef(value).pos();
@@ -1015,10 +782,61 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
+
+    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        println!(
+            "index_start_blk: {}, root {}",
+            self.index_start_blk, self.index_root_blk
+        );
+
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );
+
+        tree_reader.dump().await?;
+
+        let keys = self.load_keys(ctx).await?;
+
+        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        }
+
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val, ctx).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }
+
+        Ok(())
+    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -1058,3 +876,9 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
+
+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
+    }
+}
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -226,6 +226,14 @@ impl LayerFileName {
            _ => false,
        }
    }
+
+    pub(crate) fn kind(&self) -> &'static str {
+        use LayerFileName::*;
+        match self {
+            Delta(_) => "delta",
+            Image(_) => "image",
+        }
+    }
 }

 impl fmt::Display for LayerFileName {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -31,21 +31,23 @@ use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
-    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
+use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -56,7 +58,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
+use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -114,22 +116,14 @@ impl Summary {
    }
 }

-/// ImageLayer is the in-memory data structure associated with an on-disk image
-/// file.
-///
-/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
 pub struct ImageLayer {
-    path_or_conf: PathOrConf,
-
+    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<ImageLayerInner>,
 }

@@ -146,6 +140,8 @@ impl std::fmt::Debug for ImageLayer {
    }
 }

+/// ImageLayerInner is the in-memory data structure associated with an on-disk image
+/// file.
 pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -166,73 +162,11 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for ImageLayer {
-    /// Look up given page in the file
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for ImageLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for ImageLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for ImageLayer {
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        self.local_path()
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        self.delete_resident_layer_file()
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.info(reset)
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        self.access_stats()
-    }
-}
-
-impl ImageLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental(),
-            self.desc.file_size
-        );
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-        let file = &inner.file;
+impl ImageLayerInner {
+    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        let file = &self.file;
        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
+            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);

        tree_reader.dump().await?;

@@ -250,69 +184,36 @@ impl ImageLayer {

        Ok(())
    }
+}

-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.desc.key_range.contains(&key));
-        assert!(lsn_range.start >= self.lsn);
-        assert!(lsn_range.end >= self.lsn);
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, reconstruct_state, ctx)
-            .await
-            // FIXME: makes no sense to dump paths
-            .with_context(|| format!("read {}", self.path()))
+/// Formats the layer as the short id of its layer_desc.
+impl std::fmt::Display for ImageLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
    }
+}

-    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
-        Some(self.path())
+impl AsLayerDesc for ImageLayer {
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }
+}
+
+impl ImageLayer {
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        self.desc.dump();
+
+        if !verbose {
+            return Ok(());
+        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+
+        inner.dump(ctx).await?;

-    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
        Ok(())
    }

-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_start = self.layer_desc().image_layer_lsn();
-
-        HistoricLayerInfo::Image {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start,
-            remote: false,
-            access_stats: self.access_stats.as_api_model(reset),
-        }
-    }
-
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        fname: &ImageFileName,
-    ) -> Utf8PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.to_path_buf(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(&tenant_id, &timeline_id)
-                .join(fname.to_string()),
-        }
-    }
-
    fn temp_path_for(
        conf: &PageServerConf,
        timeline_id: TimelineId,
@@ -348,54 +249,21 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let expected_summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;

-        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
-                .await?;
+        // not production code
+        let actual_filename = path.file_name().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-            let actual_filename = path.file_name().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(loaded)
    }

-    /// Create an ImageLayer struct representing an existing file on disk
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &ImageFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> ImageLayer {
-        ImageLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_img(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn,
-                file_size,
-            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
-            lsn: filename.lsn,
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -407,7 +275,7 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
@@ -421,18 +289,8 @@ impl ImageLayer {
        })
    }

-    fn layer_name(&self) -> ImageFileName {
-        self.desc.image_file_name()
-    }
-
-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> Utf8PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            self.desc.timeline_id,
-            self.desc.tenant_id,
-            &self.layer_name(),
-        )
+    fn path(&self) -> Utf8PathBuf {
+        self.path.clone()
    }
 }

@@ -604,7 +462,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
-    async fn finish(self) -> anyhow::Result<ImageLayer> {
+    async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -658,33 +516,14 @@ impl ImageLayerWriterInner {
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = ImageLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc,
-            lsn: self.lsn,
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };

        // fsync the file
        file.sync_all().await?;

-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = ImageLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            self.timeline_id,
-            self.tenant_id,
-            &ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn,
-            },
-        );
-        std::fs::rename(self.path, final_path)?;
+        // FIXME: why not carry the virtualfile here, it supports renaming?
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", layer.path());
+        trace!("created image layer {}", layer.local_path());

        Ok(layer)
    }
@@ -746,8 +585,11 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
-    pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
-        self.inner.take().unwrap().finish().await
+    pub(crate) async fn finish(
+        mut self,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner.take().unwrap().finish(timeline).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,11 +10,12 @@ use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::collections::HashMap;
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -28,7 +29,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::RwLock;

-use super::{DeltaLayer, DeltaLayerWriter, Layer};
+use super::{DeltaLayerWriter, ResidentLayer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -207,20 +208,6 @@ impl InMemoryLayer {
    }
 }

-#[async_trait::async_trait]
-impl Layer for InMemoryLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
-            .await
-    }
-}
-
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
@@ -229,17 +216,13 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
-    ///
    /// Get layer size.
-    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
        Ok(inner.file.len())
    }

-    ///
    /// Create a new, empty, in-memory layer
-    ///
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -331,7 +314,11 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
+    pub(crate) async fn write_to_disk(
+        &self,
+        timeline: &Arc<Timeline>,
+        ctx: &RequestContext,
+    ) -> Result<ResidentLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -358,14 +345,19 @@ impl InMemoryLayer {

        let cursor = inner.file.block_cursor();

-        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
-        keys.sort_by_key(|k| k.0);
+        // Sort the keys because delta layer writer expects them sorted.
+        //
+        // NOTE: this sort can take up significant time if the layer has millions of
+        //       keys. To speed up all the comparisons we convert the key to i128 and
+        //       keep the value as a reference.
+        let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
+        keys.sort_unstable_by_key(|k| k.0);

        let ctx = RequestContextBuilder::extend(ctx)
            .page_content_kind(PageContentKind::InMemoryLayer)
            .build();
        for (key, vec_map) in keys.iter() {
-            let key = **key;
+            let key = Key::from_i128(*key);
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
@@ -376,7 +368,8 @@ impl InMemoryLayer {
            }
        }

-        let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
+        // MAX is used here because we identify L0 layers by full key range
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,4 +1,3 @@
-use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
@@ -6,7 +5,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{context::RequestContext, repository::Key};
+use crate::repository::Key;

 use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -100,6 +99,22 @@ impl PersistentLayerDesc {
        }
    }

+    pub fn from_filename(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        filename: LayerFileName,
+        file_size: u64,
+    ) -> Self {
+        match filename {
+            LayerFileName::Image(i) => {
+                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
+            }
+            LayerFileName::Delta(d) => {
+                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+            }
+        }
+    }
+
    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -173,21 +188,31 @@ impl PersistentLayerDesc {
        self.is_delta
    }

-    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental(),
-            self.file_size,
-        );
-
-        Ok(())
+    pub fn dump(&self) {
+        if self.is_delta {
+            println!(
+                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.lsn_range.start,
+                self.lsn_range.end,
+                self.is_incremental(),
+                self.file_size,
+            );
+        } else {
+            println!(
+                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.image_layer_lsn(),
+                self.is_incremental(),
+                self.file_size
+            );
+        }
    }

    pub fn file_size(&self) -> u64 {
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -1,216 +0,0 @@
-//! A RemoteLayer is an in-memory placeholder for a layer file that exists
-//! in remote storage.
-//!
-use crate::config::PageServerConf;
-use crate::context::RequestContext;
-use crate::repository::Key;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::timeline::layer_manager::LayerManager;
-use anyhow::{bail, Result};
-use camino::Utf8PathBuf;
-use pageserver_api::models::HistoricLayerInfo;
-use std::ops::Range;
-use std::sync::Arc;
-
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-use super::filename::{DeltaFileName, ImageFileName};
-use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
-    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
-};
-
-/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
-/// [`DeltaLayer`](super::DeltaLayer).
-///
-/// RemoteLayer might be downloaded on-demand during operations which are
-/// allowed download remote layers and during which, it gets replaced with a
-/// concrete `DeltaLayer` or `ImageLayer`.
-///
-/// See: [`crate::context::RequestContext`] for authorization to download
-pub struct RemoteLayer {
-    pub desc: PersistentLayerDesc,
-
-    pub layer_metadata: LayerFileMetadata,
-
-    access_stats: LayerAccessStats,
-
-    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
-
-    /// Has `LayerMap::replace` failed for this (true) or not (false).
-    ///
-    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
-    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
-    /// unprocessable, because a LayerMap::replace failed.
-    ///
-    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
-    /// a possible fast loop between `Timeline::get_reconstruct_data` and
-    /// `Timeline::download_remote_layer`, which also logs.
-    ///
-    /// [`ongoing_download`]: Self::ongoing_download
-    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
-}
-
-impl std::fmt::Debug for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.desc.filename())
-            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.desc.is_incremental())
-            .finish()
-    }
-}
-
-#[async_trait::async_trait]
-impl Layer for RemoteLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_state: &mut ValueReconstructState,
-        _ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for RemoteLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for RemoteLayer {
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        None
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        bail!("remote layer has no layer file");
-    }
-
-    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        Some(self)
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        true
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_range = self.layer_desc().lsn_range.clone();
-
-        if self.desc.is_delta {
-            HistoricLayerInfo::Delta {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                lsn_end: lsn_range.end,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        } else {
-            HistoricLayerInfo::Image {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
-impl RemoteLayer {
-    pub fn new_img(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &ImageFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_img(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn,
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    pub fn new_delta(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &DeltaFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_delta(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn_range.clone(),
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub(crate) fn create_downloaded_layer(
-        &self,
-        _layer_map_lock_held_witness: &LayerManager,
-        conf: &'static PageServerConf,
-        file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
-        if self.desc.is_delta {
-            let fname = self.desc.delta_file_name();
-            Arc::new(DeltaLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
-            ))
-        } else {
-            let fname = self.desc.image_file_name();
-            Arc::new(ImageLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
-            ))
-        }
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -12,7 +12,74 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::completion;
+use utils::{backoff, completion};
+
+static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
+    once_cell::sync::Lazy::new(|| {
+        let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+        let permits = usize::max(
+            1,
+            // while a lot of the work is done on spawn_blocking, we still do
+            // repartitioning in the async context. this should leave us some workers
+            // unblocked to be blocked on other work, hopefully easing any outside visible
+            // effects of restarts.
+            //
+            // 6/8 is a guess; previously we ran with unlimited 8 and more from
+            // spawn_blocking.
+            (total_threads * 3).checked_div(4).unwrap_or(0),
+        );
+        assert_ne!(permits, 0, "we will not be adding in permits later");
+        assert!(
+            permits < total_threads,
+            "need threads avail for shorter work"
+        );
+        tokio::sync::Semaphore::new(permits)
+    });
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
+#[strum(serialize_all = "snake_case")]
+pub(crate) enum BackgroundLoopKind {
+    Compaction,
+    Gc,
+    Eviction,
+    ConsumptionMetricsCollectMetrics,
+    ConsumptionMetricsSyntheticSizeWorker,
+}
+
+impl BackgroundLoopKind {
+    fn as_static_str(&self) -> &'static str {
+        let s: &'static str = self.into();
+        s
+    }
+}
+
+pub(crate) enum RateLimitError {
+    Cancelled,
+}
+
+pub(crate) async fn concurrent_background_tasks_rate_limit(
+    loop_kind: BackgroundLoopKind,
+    _ctx: &RequestContext,
+    cancel: &CancellationToken,
+) -> Result<impl Drop, RateLimitError> {
+    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
+        .with_label_values(&[loop_kind.as_static_str()])
+        .inc();
+    scopeguard::defer!(
+        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
+    );
+    tokio::select! {
+        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
+            match permit {
+                Ok(permit) => Ok(permit),
+                Err(_closed) => unreachable!("we never close the semaphore"),
+            }
+        },
+        _ = cancel.cancelled() => {
+            Err(RateLimitError::Cancelled)
+        }
+    }
+}

 /// Start per tenant background loops: compaction and gc.
 pub fn start_background_loops(
@@ -72,7 +139,10 @@ pub fn start_background_loops(
 /// Compaction task's main loop
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -109,14 +179,24 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            } else {
                // Run compaction
                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
-                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                } else {
+                    error_run_count = 0;
                    period
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, "compaction");
+            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -135,7 +215,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 /// GC task's main loop
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -177,14 +260,24 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
                    .await;
                if let Err(e) = res {
-                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                } else {
+                    error_run_count = 0;
                    period
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, "gc");
+            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -258,20 +351,24 @@ pub(crate) async fn random_init_delay(
 }

 /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
-pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
+pub(crate) fn warn_when_period_overrun(
+    elapsed: Duration,
+    period: Duration,
+    task: BackgroundLoopKind,
+) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
        // humantime does no significant digits clamping whereas Duration's debug is a bit more
        // intelligent. however it makes sense to keep the "configuration format" for period, even
        // though there's no way to output the actual config value.
-        warn!(
+        info!(
            ?elapsed,
            period = %humantime::format_duration(period),
-            task,
+            ?task,
            "task iteration took longer than the configured period"
        );
        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
-            .with_label_values(&[task, &format!("{}", period.as_secs())])
+            .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
            .inc();
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -38,6 +38,14 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    }
    debug!("wal receiver shutdown confirmed");

+    // Shut down the layer flush task before the remote client, as one depends on the other
+    task_mgr::shutdown_tasks(
+        Some(TaskKind::LayerFlushTask),
+        Some(timeline.tenant_id),
+        Some(timeline.timeline_id),
+    )
+    .await;
+
    // Prevent new uploads from starting.
    if let Some(remote_client) = timeline.remote_client.as_ref() {
        let res = remote_client.stop();
@@ -294,6 +302,7 @@ async fn cleanup_remaining_timeline_fs_traces(
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
+        .or_else(fs_ext::ignore_not_found)
        .context("remove delete mark")
 }

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -29,7 +29,7 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        storage_layer::PersistentLayer,
+        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -129,7 +129,11 @@ impl Timeline {
                    ControlFlow::Continue(()) => (),
                }
                let elapsed = start.elapsed();
-                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
+                crate::tenant::tasks::warn_when_period_overrun(
+                    elapsed,
+                    p.period,
+                    BackgroundLoopKind::Eviction,
+                );
                crate::metrics::EVICTION_ITERATION_DURATION
                    .get_metric_with_label_values(&[
                        &format!("{}", p.period.as_secs()),
@@ -150,6 +154,17 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

+        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
+            BackgroundLoopKind::Eviction,
+            ctx,
+            cancel,
+        )
+        .await
+        {
+            Ok(permit) => permit,
+            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
+        };
+
        // If we evict layers but keep cached values derived from those layers, then
        // we face a storm of on-demand downloads after pageserver restart.
        // The reason is that the restart empties the caches, and so, the values
@@ -194,15 +209,26 @@ impl Timeline {
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
-        let candidates: Vec<Arc<dyn PersistentLayer>> = {
+        let candidates: Vec<_> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);
-                if hist_layer.is_remote_layer() {
-                    continue;
-                }
+
+                // guard against eviction while we inspect it; it might be that eviction_task and
+                // disk_usage_eviction_task both select the same layers to be evicted, and
+                // seemingly free up double the space. both succeeding is of no consequence.
+                let guard = match hist_layer.keep_resident().await {
+                    Ok(Some(l)) => l,
+                    Ok(None) => continue,
+                    Err(e) => {
+                        // these should not happen, but we cannot make them statically impossible right
+                        // now.
+                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
+                        continue;
+                    }
+                };

                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
@@ -233,7 +259,7 @@ impl Timeline {
                    }
                };
                if no_activity_for > p.threshold {
-                    candidates.push(hist_layer)
+                    candidates.push(guard.drop_eviction_guard())
                }
            }
            candidates
@@ -252,7 +278,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
+            .evict_layer_batch(remote_client, &candidates, cancel)
            .await
        {
            Err(pre_err) => {
@@ -263,7 +289,7 @@ impl Timeline {
            Ok(results) => results,
        };
        assert_eq!(results.len(), candidates.len());
-        for (l, result) in candidates.iter().zip(results) {
+        for result in results {
            match result {
                None => {
                    stats.skipped_for_shutdown += 1;
@@ -271,20 +297,10 @@ impl Timeline {
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                    stats.not_evictable += 1;
-                }
-                Some(Err(EvictionError::FileNotFound)) => {
+                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
-                Some(Err(
-                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
-                )) => {
-                    let e = utils::error::report_compact_sources(&e);
-                    warn!(layer = %l, "failed to evict layer: {e}");
-                    stats.not_evictable += 1;
-                }
            }
        }
        if stats.candidates == stats.not_evictable {
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -72,7 +72,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
 }

 /// Decision on what to do with a layer file after considering its local and remote metadata.
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub(super) enum Decision {
    /// The layer is not present locally.
    Evicted(LayerFileMetadata),
@@ -84,27 +84,30 @@ pub(super) enum Decision {
    },
    /// The layer is present locally, and metadata matches.
    UseLocal(LayerFileMetadata),
-    /// The layer is only known locally, it needs to be uploaded.
-    NeedsUpload(LayerFileMetadata),
 }

-/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
+/// A layer needs to be left out of the layer map.
 #[derive(Debug)]
-pub(super) struct FutureLayer {
-    /// The local metadata. `None` if the layer is only known through [`IndexPart`].
-    pub(super) local: Option<LayerFileMetadata>,
+pub(super) enum DismissedLayer {
+    /// The related layer is in the future compared to disk_consistent_lsn; it must not be loaded.
+    Future {
+        /// The local metadata. `None` if the layer is only known through [`IndexPart`].
+        local: Option<LayerFileMetadata>,
+    },
+    /// The layer only exists locally.
+    ///
+    /// In order to make crash safe updates to layer map, we must dismiss layers which are only
+    /// found locally or not yet included in the remote `index_part.json`.
+    LocalOnly(LayerFileMetadata),
 }

 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
-///
-/// This function should not gain additional reasons to fail than [`FutureLayer`], consider adding
-/// the checks earlier to [`scan_timeline_dir`].
 pub(super) fn reconcile(
    discovered: Vec<(LayerFileName, u64)>,
    index_part: Option<&IndexPart>,
    disk_consistent_lsn: Lsn,
    generation: Generation,
-) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
+) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
    use Decision::*;

    // name => (local, remote)
@@ -142,17 +145,19 @@ pub(super) fn reconcile(
        .into_iter()
        .map(|(name, (local, remote))| {
            let decision = if name.is_in_future(disk_consistent_lsn) {
-                Err(FutureLayer { local })
+                Err(DismissedLayer::Future { local })
            } else {
-                Ok(match (local, remote) {
-                    (Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
-                    (Some(x), Some(_)) => UseLocal(x),
-                    (None, Some(x)) => Evicted(x),
-                    (Some(x), None) => NeedsUpload(x),
+                match (local, remote) {
+                    (Some(local), Some(remote)) if local != remote => {
+                        Ok(UseRemote { local, remote })
+                    }
+                    (Some(x), Some(_)) => Ok(UseLocal(x)),
+                    (None, Some(x)) => Ok(Evicted(x)),
+                    (Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
                    (None, None) => {
                        unreachable!("there must not be any non-local non-remote files")
                    }
-                })
+                }
            };

            (name, decision)
@@ -192,14 +197,21 @@ pub(super) fn cleanup_future_layer(
    name: &LayerFileName,
    disk_consistent_lsn: Lsn,
 ) -> anyhow::Result<()> {
-    use LayerFileName::*;
-    let kind = match name {
-        Delta(_) => "delta",
-        Image(_) => "image",
-    };
    // future image layers are allowed to be produced always for not yet flushed to disk
    // lsns stored in InMemoryLayer.
+    let kind = name.kind();
    tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
-    crate::tenant::timeline::rename_to_backup(path)?;
+    std::fs::remove_file(path)?;
+    Ok(())
+}
+
+pub(super) fn cleanup_local_only_file(
+    path: &Utf8Path,
+    name: &LayerFileName,
+    local: &LayerFileMetadata,
+) -> anyhow::Result<()> {
+    let kind = name.kind();
+    tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
+    std::fs::remove_file(path)?;
    Ok(())
 }
--- a/Show More
+++ b/Show More