Run pgindent on walproposer.

add walproposer_c top comment
Add comments
2026-02-05 19:50:36 +00:00 · 2023-09-29 13:16:48 +03:00 · 2023-09-29 12:32:25 +03:00 · 2023-09-27 13:00:27 +00:00 · 2023-09-27 13:00:27 +00:00 · 2023-09-27 13:00:27 +00:00
125 changed files with 9953 additions and 4353 deletions
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -16,21 +16,29 @@ on:
      # Actual magic happens here:
      - labeled

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+
 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
  BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"

-permissions: write-all
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}

-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+defaults:
+  run:
+    shell: bash -euo pipefail {0}

 jobs:
  remove-label:
    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
    # The PR should be reviewed and labelled manually again.

+    permissions:
+      pull-requests: write # For `gh pr edit`
+
    if: |
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -43,6 +51,10 @@ jobs:
  create-or-update-pr-for-ci-run:
    # Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.

+    permissions:
+      pull-requests: write # for `gh pr edit`
+      # For `git push` and `gh pr create` we use CI_ACCESS_TOKEN
+
    if: |
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -75,7 +87,7 @@ jobs:
            Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
          EOF

-          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --base main --json number --jq '.[].number')"
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
          if [ -z "${ALREADY_CREATED}" ]; then
            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
                                                       --body-file "body.md" \
@@ -87,6 +99,10 @@ jobs:
  cleanup:
    # Close PRs and delete branchs if the original PR is closed.

+    permissions:
+      contents: write # for `--delete-branch` flag in `gh pr close`
+      pull-requests: write # for `gh pr close`
+
    if: |
      github.event.action == 'closed' &&
      github.event.pull_request.head.repo.full_name != github.repository
@@ -94,8 +110,9 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-      - run: |
-          CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --json 'closed' --jq '.[].closed')"
+      - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
+        run: |
+          CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')"
          if [ "${CLOSED}" == "false" ]; then
            gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
          fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -834,7 +834,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.5
+      VM_BUILDER_VERSION: v0.17.11

    steps:
      - name: Checkout
@@ -1091,8 +1091,9 @@ jobs:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -636,7 +636,7 @@ dependencies = [
 "sha1",
 "sync_wrapper",
 "tokio",
- "tokio-tungstenite 0.20.0",
+ "tokio-tungstenite",
 "tower",
 "tower-layer",
 "tower-service",
@@ -1941,15 +1941,15 @@ dependencies = [

 [[package]]
 name = "hyper-tungstenite"
-version = "0.9.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "880b8b1c98a5ec2a505c7c90db6d3f6f1f480af5655d9c5b55facc9382a5a5b5"
+checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
 dependencies = [
 "hyper",
- "pin-project",
+ "pin-project-lite",
 "tokio",
- "tokio-tungstenite 0.18.0",
- "tungstenite 0.18.0",
+ "tokio-tungstenite",
+ "tungstenite",
 ]

 [[package]]
@@ -2908,9 +2908,9 @@ dependencies = [

 [[package]]
 name = "pin-project-lite"
-version = "0.2.9"
+version = "0.2.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
+checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"

 [[package]]
 name = "pin-utils"
@@ -3417,6 +3417,7 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
+ "rand",
 "scopeguard",
 "serde",
 "serde_json",
@@ -4641,18 +4642,6 @@ dependencies = [
 "xattr",
 ]

-[[package]]
-name = "tokio-tungstenite"
-version = "0.18.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54319c93411147bced34cb5609a80e0a8e44c5999c93903a81cd866630ec0bfd"
-dependencies = [
- "futures-util",
- "log",
- "tokio",
- "tungstenite 0.18.0",
-]
-
 [[package]]
 name = "tokio-tungstenite"
 version = "0.20.0"
@@ -4662,7 +4651,7 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite 0.20.0",
+ "tungstenite",
 ]

 [[package]]
@@ -4977,28 +4966,9 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"

 [[package]]
 name = "tungstenite"
-version = "0.18.0"
+version = "0.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30ee6ab729cd4cf0fd55218530c4522ed30b7b6081752839b68fcec8d0960788"
-dependencies = [
- "base64 0.13.1",
- "byteorder",
- "bytes",
- "http",
- "httparse",
- "log",
- "rand",
- "sha1",
- "thiserror",
- "url",
- "utf-8",
-]
-
-[[package]]
-name = "tungstenite"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
+checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9"
 dependencies = [
 "byteorder",
 "bytes",
@@ -5648,6 +5618,7 @@ dependencies = [
 "tower",
 "tracing",
 "tracing-core",
+ "tungstenite",
 "url",
 "uuid",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -78,7 +78,7 @@ hostname = "0.3.1"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.9"
+hyper-tungstenite = "0.11"
 inotify = "0.10.2"
 itertools = "0.10"
 jsonwebtoken = "8"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -124,8 +124,21 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.8.tar.gz -O plv8.tar.gz && \
-    echo "92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 plv8.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PLV8_VERSION=3.1.5 \
+        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
+        ;; \
+      "v16") \
+        export PLV8_VERSION=3.1.8 \
+        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
+        ;; \
+      *) \
+        echo "Export the valid PG_VERSION variable" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
+    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -416,7 +429,7 @@ RUN case "${PG_VERSION}" in \
        ;; \
      "v16") \
        export PG_HINT_PLAN_VERSION=16_1_6_0 \
-        export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \
+        export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
        ;; \
      *) \
        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
@@ -566,8 +579,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
-        export PG_EMBEDDING_VERSION=0.3.6 \
-        export PG_EMBEDDING_CHECKSUM=b2e2b359335d26987778c7fae0c9bcc8ebc3530fc214113be1ddbc8a136e52ac \
+        export PG_EMBEDDING_VERSION=0.3.5 \
+        export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
        ;; \
      *) \
        echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
@@ -576,8 +589,7 @@ RUN case "${PG_VERSION}" in \
    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
+    make -j $(getconf _NPROCESSORS_ONLN) install

 #########################################################################################
 #
--- a/15
+++ b/15
@@ -153,18 +153,6 @@ neon-pg-ext-%: postgres-%
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install

-# pg_embedding was temporarily released as hnsw from this repo, when we only
-# supported PostgreSQL 14 and 15
-neon-pg-ext-v14: neon-pg-ext-hnsw-v14
-neon-pg-ext-v15: neon-pg-ext-hnsw-v15
-
-neon-pg-ext-hnsw-%: postgres-headers-% postgres-%
-	+@echo "Compiling hnsw $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install
-
 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
@@ -179,9 +167,6 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
--- a/README.md
+++ b/README.md
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry lsof
+libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry lsof
+  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    if attach_req.pageserver_id.is_some() {
        tenant_state.generation += 1;
    }
+    tenant_state.pageserver = attach_req.pageserver_id;
    let generation = tenant_state.generation;

    locked.save().await.map_err(ApiError::InternalServerError)?;
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -3,9 +3,9 @@
 //!
 use chrono::{DateTime, Utc};
 use rand::Rng;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -27,7 +27,8 @@ impl EventType {
    }

    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
+        // these can most likely be thought of as Range or RangeFull, at least pageserver creates
+        // incremental ranges where the stop and next start are equal.
        use EventType::*;
        match self {
            Incremental {
@@ -41,15 +42,25 @@ impl EventType {
    pub fn is_incremental(&self) -> bool {
        matches!(self, EventType::Incremental { .. })
    }
+
+    /// Returns the absolute time, or for incremental ranges, the stop time.
+    pub fn recorded_at(&self) -> &DateTime<Utc> {
+        use EventType::*;
+
+        match self {
+            Absolute { time } => time,
+            Incremental { stop_time, .. } => stop_time,
+        }
+    }
 }

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
-pub struct Event<Extra> {
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct Event<Extra, Metric> {
    #[serde(flatten)]
    #[serde(rename = "type")]
    pub kind: EventType,

-    pub metric: &'static str,
+    pub metric: Metric,
    pub idempotency_key: String,
    pub value: u64,

@@ -58,12 +69,38 @@ pub struct Event<Extra> {
 }

 pub fn idempotency_key(node_id: &str) -> String {
-    format!(
-        "{}-{}-{:04}",
-        Utc::now(),
-        node_id,
-        rand::thread_rng().gen_range(0..=9999)
-    )
+    IdempotencyKey::generate(node_id).to_string()
+}
+
+/// Downstream users will use these to detect upload retries.
+pub struct IdempotencyKey<'a> {
+    now: chrono::DateTime<Utc>,
+    node_id: &'a str,
+    nonce: u16,
+}
+
+impl std::fmt::Display for IdempotencyKey<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}-{}-{:04}", self.now, self.node_id, self.nonce)
+    }
+}
+
+impl<'a> IdempotencyKey<'a> {
+    pub fn generate(node_id: &'a str) -> Self {
+        IdempotencyKey {
+            now: Utc::now(),
+            node_id,
+            nonce: rand::thread_rng().gen_range(0..=9999),
+        }
+    }
+
+    pub fn for_tests(now: DateTime<Utc>, node_id: &'a str, nonce: u16) -> Self {
+        IdempotencyKey {
+            now,
+            node_id,
+            nonce,
+        }
+    }
 }

 pub const CHUNK_SIZE: usize = 1000;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -363,8 +363,15 @@ pub struct TimelineInfo {
    pub latest_gc_cutoff_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
+
+    /// The LSN that we have succesfully uploaded to remote storage
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
+
+    /// The LSN that we are advertizing to safekeepers
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn_visible: Lsn,
+
    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -137,9 +137,12 @@ pub const XLOG_HEAP_INSERT: u8 = 0x00;
 pub const XLOG_HEAP_DELETE: u8 = 0x10;
 pub const XLOG_HEAP_UPDATE: u8 = 0x20;
 pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
+pub const XLOG_HEAP_LOCK: u8 = 0x60;
 pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80;
 pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
 pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
+pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60;
+pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01;
 pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
 pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -29,3 +29,4 @@ workspace_hack.workspace = true
 [dev-dependencies]
 tempfile.workspace = true
 test-context.workspace = true
+rand.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -20,6 +20,7 @@ use std::{

 use anyhow::{bail, Context};

+use serde::{Deserialize, Serialize};
 use tokio::io;
 use toml_edit::Item;
 use tracing::info;
@@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

+/// As defined in S3 docs
+pub const MAX_KEYS_PER_DELETE: usize = 1000;
+
 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';

 /// Path on the remote storage, relative to some inner prefix.
@@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);

+impl Serialize for RemotePath {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.collect_str(self)
+    }
+}
+
+impl<'de> Deserialize<'de> for RemotePath {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let str = String::deserialize(deserializer)?;
+        Ok(Self(PathBuf::from(&str)))
+    }
+}
+
 impl std::fmt::Display for RemotePath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0.display())
@@ -88,6 +111,10 @@ impl RemotePath {
    pub fn extension(&self) -> Option<&str> {
        self.0.extension()?.to_str()
    }
+
+    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
+        self.0.strip_prefix(&p.0)
+    }
 }

 /// Storage (potentially remote) API to manage its state.
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -33,11 +33,10 @@ use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

-const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
-
 pub(super) mod metrics;

 use self::metrics::{AttemptOutcome, RequestKind};
@@ -500,7 +499,7 @@ impl RemoteStorage for S3Bucket {
            delete_objects.push(obj_id);
        }

-        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
            let started_at = start_measuring_requests(kind);

            let resp = self
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -378,21 +378,30 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
 fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
-    let random_prefix_part = std::time::SystemTime::now()
+
+    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
+    // millis is just a debugging aid for easier finding the prefix later.
+    let millis = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
-        .as_nanos();
+        .as_millis();
+
+    // because nanos can be the same for two threads so can millis, add randomness
+    let random = rand::thread_rng().gen::<u32>();
+
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
-            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+            prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -89,6 +89,22 @@ impl Generation {
            Self::Broken => panic!("Attempted to use a broken generation"),
        }
    }
+
+    pub fn next(&self) -> Generation {
+        match self {
+            Self::Valid(n) => Self::Valid(*n + 1),
+            Self::None => Self::Valid(1),
+            Self::Broken => panic!("Attempted to use a broken generation"),
+        }
+    }
+
+    pub fn into(self) -> Option<u32> {
+        if let Self::Valid(v) = self {
+            Some(v)
+        } else {
+            None
+        }
+    }
 }

 impl Serialize for Generation {
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -24,6 +24,9 @@ pub enum ApiError {
    #[error("Precondition failed: {0}")]
    PreconditionFailed(Box<str>),

+    #[error("Shutting down")]
+    ShuttingDown,
+
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -52,6 +55,10 @@ impl ApiError {
                self.to_string(),
                StatusCode::PRECONDITION_FAILED,
            ),
+            ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
+                "Shutting down".to_string(),
+                StatusCode::SERVICE_UNAVAILABLE,
+            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -216,6 +216,24 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
    }
 }

+/// When you will store a secret but want to make sure it won't
+/// be accidentally logged, wrap it in a SecretString, whose Debug
+/// implementation does not expose the contents.
+#[derive(Clone, Eq, PartialEq)]
+pub struct SecretString(String);
+
+impl SecretString {
+    pub fn get_contents(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl std::fmt::Debug for SecretString {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[SECRET]")
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use metrics::{core::Opts, IntCounterVec};
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -315,12 +315,8 @@ impl CgroupWatcher {
    where
        E: Stream<Item = Sequenced<u64>>,
    {
-        // There are several actions might do when receiving a `memory.high`,
-        // such as freezing the cgroup, or increasing its `memory.high`. We don't
-        // want to do these things too often (because postgres needs to run, and
-        // we only have so much memory). These timers serve as rate limits for this.
        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
-        let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut last_memory_high_increase_at: Option<Instant> = None;
        let mut events = pin!(events);

        // Are we waiting to be upscaled? Could be true if we request upscale due
@@ -332,6 +328,8 @@ impl CgroupWatcher {
                upscale = upscales.recv() => {
                    let Sequenced { seqnum, data } = upscale
                        .context("failed to listen on upscale notification channel")?;
+                    waiting_on_upscale = false;
+                    last_memory_high_increase_at = None;
                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
                }
@@ -396,12 +394,17 @@ impl CgroupWatcher {
                            .send(())
                            .await
                            .context("failed to request upscale")?;
+                        waiting_on_upscale = true;
                        continue;
                    }

                    // Shoot, we can't freeze or and we're still waiting on upscale,
                    // increase memory.high to reduce throttling
-                    if wait_to_increase_memory_high.is_elapsed() {
+                    let can_increase_memory_high = match last_memory_high_increase_at {
+                        None => true,
+                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
+                    };
+                    if can_increase_memory_high {
                        info!(
                            "received memory.high event, \
                            but too soon to refreeze and already requested upscale \
@@ -437,12 +440,11 @@ impl CgroupWatcher {
                        );
                        self.set_high_bytes(new_high)
                            .context("failed to set memory.high")?;
-                        wait_to_increase_memory_high
-                            .as_mut()
-                            .reset(Instant::now() + self.config.memory_high_increase_every)
+                        last_memory_high_increase_at = Some(Instant::now());
+                        continue;
                    }

-                    // we can't do anything
+                    info!("received memory.high event, but can't do anything");
                }
            };
        }
@@ -559,14 +561,7 @@ impl CgroupWatcher {
 /// Setting these values also affects the thresholds for receiving usage alerts.
 #[derive(Debug)]
 pub struct MemoryLimits {
-    high: u64,
-    max: u64,
-}
-
-impl MemoryLimits {
-    pub fn new(high: u64, max: u64) -> Self {
-        Self { max, high }
-    }
+    pub high: u64,
 }

 // Methods for manipulating the actual cgroup
@@ -643,12 +638,7 @@ impl CgroupWatcher {

    /// Set cgroup memory.high and memory.max.
    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
-        info!(
-            limits.high,
-            limits.max,
-            path = self.path(),
-            "writing new memory limits",
-        );
+        info!(limits.high, path = self.path(), "writing new memory limits",);
        self.memory()
            .context("failed to get memory subsystem while setting memory limits")?
            .set_mem(cgroups_rs::memory::SetMemory {
@@ -657,7 +647,7 @@ impl CgroupWatcher {
                high: Some(MaxValue::Value(
                    u64::min(limits.high, i64::MAX as u64) as i64
                )),
-                max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
+                max: None,
            })
            .context("failed to set memory limits")
    }
@@ -665,7 +655,7 @@ impl CgroupWatcher {
    /// Given some amount of available memory, set the desired cgroup memory limits
    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
        let new_high = self.config.calculate_memory_high_value(available_memory);
-        let limits = MemoryLimits::new(new_high, available_memory);
+        let limits = MemoryLimits { high: new_high };
        info!(
            path = self.path(),
            memory = ?limits,
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -257,12 +257,11 @@ impl Runner {
                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
            }

-            let limits = MemoryLimits::new(
+            let limits = MemoryLimits {
                // new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
                // since it is properly initialized in the previous cgroup if let block
-                new_cgroup_mem_high,
-                available_memory,
-            );
+                high: new_cgroup_mem_high,
+            };
            cgroup
                .set_limits(&limits)
                .context("failed to set cgroup memory limits")?;
@@ -328,7 +327,9 @@ impl Runner {
                name = cgroup.path(),
                "updating cgroup memory.high",
            );
-            let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
+            let limits = MemoryLimits {
+                high: new_cgroup_mem_high,
+            };
            cgroup
                .set_limits(&limits)
                .context("failed to set file cache size")?;
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -80,11 +80,11 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+tempfile.workspace = true

 [dev-dependencies]
 criterion.workspace = true
 hex-literal.workspace = true
-tempfile.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -3,6 +3,8 @@
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

 use anyhow::Result;
+use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
@@ -96,9 +98,9 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0).await?;
+    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -125,6 +127,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
                prev_key = Some(curr.next());
                true
            },
+            ctx,
        )
        .await?;
    let mut holes = heap.into_vec();
@@ -135,6 +138,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
 pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
    pageserver::virtual_file::init(10);
@@ -163,7 +167,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -2,6 +2,8 @@ use std::path::{Path, PathBuf};

 use anyhow::Result;
 use clap::Subcommand;
+use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
@@ -44,12 +46,12 @@ pub(crate) enum LayerCmd {
    },
 }

-async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0).await?;
+    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -67,11 +69,12 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
                all.push((curr, BlobRef(value_offset)));
                true
            },
+            ctx,
        )
        .await?;
    let cursor = BlockCursor::new_fileblockreader(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos()).await?;
+        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
@@ -79,6 +82,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
 }

 pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    match cmd {
        LayerCmd::List { path } => {
            for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
@@ -153,7 +157,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path()).await?;
+                            read_delta_file(layer.path(), &ctx).await?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -20,6 +21,7 @@ use metrics::set_build_info_metric;
 use pageserver::{
    config::{defaults::*, PageServerConf},
    context::{DownloadBehavior, RequestContext},
+    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
    task_mgr::TaskKind,
    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -346,9 +348,22 @@ fn start_pageserver(
        }
    };

+    // Top-level cancellation token for the process
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // Set up deletion queue
+    let (deletion_queue, deletion_workers) = DeletionQueue::new(
+        remote_storage.clone(),
+        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        conf,
+    );
+    if let Some(deletion_workers) = deletion_workers {
+        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
+    }
+
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint("initial", "Starting loading tenants");
@@ -379,13 +394,13 @@ fn start_pageserver(
    };

    // Scan the local 'tenants/' directory and start loading the tenants
-    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
-
+    let deletion_queue_client = deletion_queue.new_client();
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
+            deletion_queue_client,
        },
        order,
        shutdown_pageserver.clone(),
@@ -481,9 +496,10 @@ fn start_pageserver(
            http::routes::State::new(
                conf,
                http_auth.clone(),
-                remote_storage,
+                remote_storage.clone(),
                broker_client.clone(),
                disk_usage_eviction_state,
+                deletion_queue.new_client(),
            )
            .context("Failed to initialize router state")?,
        );
@@ -518,6 +534,9 @@ fn start_pageserver(
            // creates a child context with the right DownloadBehavior.
            DownloadBehavior::Error,
        );
+
+        let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
+
        task_mgr::spawn(
            crate::BACKGROUND_RUNTIME.handle(),
            TaskKind::MetricsCollection,
@@ -544,6 +563,7 @@ fn start_pageserver(
                    conf.cached_metric_collection_interval,
                    conf.synthetic_size_calculation_interval,
                    conf.id,
+                    local_disk_storage,
                    metrics_ctx,
                )
                .instrument(info_span!("metrics_collection"))
@@ -607,7 +627,12 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            ));
            unreachable!()
        }
    })
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,6 +11,7 @@ use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
+use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
 use reqwest::Url;
@@ -64,7 +65,7 @@ pub mod defaults {
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
+    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -207,6 +208,9 @@ pub struct PageServerConf {
    pub background_task_maximum_delay: Duration,

    pub control_plane_api: Option<Url>,
+
+    /// JWT token for use with the control plane API.
+    pub control_plane_api_token: Option<SecretString>,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -283,6 +287,7 @@ struct PageServerConfigBuilder {
    background_task_maximum_delay: BuilderValue<Duration>,

    control_plane_api: BuilderValue<Option<Url>>,
+    control_plane_api_token: BuilderValue<Option<SecretString>>,
 }

 impl Default for PageServerConfigBuilder {
@@ -347,6 +352,7 @@ impl Default for PageServerConfigBuilder {
            .unwrap()),

            control_plane_api: Set(None),
+            control_plane_api_token: Set(None),
        }
    }
 }
@@ -475,8 +481,8 @@ impl PageServerConfigBuilder {
        self.background_task_maximum_delay = BuilderValue::Set(delay);
    }

-    pub fn control_plane_api(&mut self, api: Url) {
-        self.control_plane_api = BuilderValue::Set(Some(api))
+    pub fn control_plane_api(&mut self, api: Option<Url>) {
+        self.control_plane_api = BuilderValue::Set(api)
    }

    pub fn build(self) -> anyhow::Result<PageServerConf> {
@@ -567,6 +573,9 @@ impl PageServerConfigBuilder {
            control_plane_api: self
                .control_plane_api
                .ok_or(anyhow!("missing control_plane_api"))?,
+            control_plane_api_token: self
+                .control_plane_api_token
+                .ok_or(anyhow!("missing control_plane_api_token"))?,
        })
    }
 }
@@ -580,6 +589,27 @@ impl PageServerConf {
        self.workdir.join(TENANTS_SEGMENT_NAME)
    }

+    pub fn deletion_prefix(&self) -> PathBuf {
+        self.workdir.join("deletion")
+    }
+
+    pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix()
+            .join(format!("{sequence:016x}-{VERSION:02x}.list"))
+    }
+
+    pub fn deletion_header_path(&self) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
+    }
+
    pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
        self.tenants_path().join(tenant_id.to_string())
    }
@@ -747,7 +777,14 @@ impl PageServerConf {
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
-                "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
+                "control_plane_api" => {
+                    let parsed = parse_toml_string(key, item)?;
+                    if parsed.is_empty() {
+                        builder.control_plane_api(None)
+                    } else {
+                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
+                    }
+                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -917,6 +954,7 @@ impl PageServerConf {
            ondemand_download_behavior_treat_error_as_warn: false,
            background_task_maximum_delay: Duration::ZERO,
            control_plane_api: None,
+            control_plane_api_token: None,
        }
    }
 }
@@ -1140,7 +1178,8 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: humantime::parse_duration(
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1196,7 +1235,8 @@ background_task_maximum_delay = '334 s'
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,188 +1,54 @@
-//!
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
-//! Cache metrics to send only the updated ones.
-//!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
-use anyhow;
-use chrono::{DateTime, Utc};
-use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
+use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
 use reqwest::Url;
-use serde::Serialize;
-use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use tracing::*;
-use utils::id::{NodeId, TenantId, TimelineId};
-use utils::lsn::Lsn;
+use utils::id::NodeId;
+
+mod metrics;
+use metrics::MetricsKey;
+mod disk_cache;
+mod upload;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

-#[serde_as]
-#[derive(Serialize, Debug, Clone, Copy)]
-struct Ids {
-    #[serde_as(as = "DisplayFromStr")]
-    tenant_id: TenantId,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    timeline_id: Option<TimelineId>,
-}
+/// Basically a key-value pair, but usually in a Vec except for [`Cache`].
+///
+/// This is as opposed to `consumption_metrics::Event` which is the externally communicated form.
+/// Difference is basically the missing idempotency key, which lives only for the duration of
+/// upload attempts.
+type RawMetric = (MetricsKey, (EventType, u64));

-/// Key that uniquely identifies the object, this metric describes.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-struct MetricsKey {
-    tenant_id: TenantId,
-    timeline_id: Option<TimelineId>,
-    metric: &'static str,
-}
-
-impl MetricsKey {
-    const fn absolute_values(self) -> AbsoluteValueFactory {
-        AbsoluteValueFactory(self)
-    }
-    const fn incremental_values(self) -> IncrementalValueFactory {
-        IncrementalValueFactory(self)
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(MetricsKey);
-
-impl AbsoluteValueFactory {
-    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        (key, (EventType::Absolute { time }, val))
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(MetricsKey);
-
-impl IncrementalValueFactory {
-    #[allow(clippy::wrong_self_convention)]
-    fn from_previous_up_to(
-        self,
-        prev_end: DateTime<Utc>,
-        up_to: DateTime<Utc>,
-        val: u64,
-    ) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        // cannot assert prev_end < up_to because these are realtime clock based
-        (
-            key,
-            (
-                EventType::Incremental {
-                    start_time: prev_end,
-                    stop_time: up_to,
-                },
-                val,
-            ),
-        )
-    }
-
-    fn key(&self) -> &MetricsKey {
-        &self.0
-    }
-}
-
-// the static part of a MetricsKey
-impl MetricsKey {
-    /// Absolute value of [`Timeline::get_last_record_lsn`].
-    ///
-    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
-    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size",
-        }
-        .absolute_values()
-    }
-
-    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
-    /// previously sent, starting from the previously sent incremental time range ending at the
-    /// latest absolute measurement.
-    const fn written_size_delta(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> IncrementalValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            // the name here is correctly about data not size, because that is what is wanted by
-            // downstream pipeline
-            metric: "written_data_bytes_delta",
-        }
-        .incremental_values()
-    }
-
-    /// Exact [`Timeline::get_current_logical_size`].
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    const fn timeline_logical_size(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "timeline_logical_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::remote_size`]
-    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
-    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "remote_storage_size",
-        }
-        .absolute_values()
-    }
-
-    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
-    ///
-    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "resident_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
-    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
-    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "synthetic_storage_size",
-        }
-        .absolute_values()
-    }
-}
+/// Caches the [`RawMetric`]s
+///
+/// In practice, during startup, last sent values are stored here to be used in calculating new
+/// ones. After successful uploading, the cached values are updated to cache. This used to be used
+/// for deduplication, but that is no longer needed.
+type Cache = HashMap<MetricsKey, (EventType, u64)>;

 /// Main thread that serves metrics collection
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
    metric_collection_interval: Duration,
-    cached_metric_collection_interval: Duration,
+    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
+    local_disk_storage: PathBuf,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-    info!("starting collect_metrics");
+    if _cached_metric_collection_interval != Duration::ZERO {
+        tracing::warn!(
+            "cached_metric_collection_interval is no longer used, please set it to zero."
+        )
+    }

    // spin up background worker that caclulates tenant sizes
    let worker_ctx =
@@ -202,543 +68,216 @@ pub async fn collect_metrics(
        },
    );

+    let path: Arc<PathBuf> = Arc::new(local_disk_storage);
+
+    let cancel = task_mgr::shutdown_token();
+
+    let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
+
+    let mut cached_metrics = tokio::select! {
+        _ = cancel.cancelled() => return Ok(()),
+        ret = restore_and_reschedule => ret,
+    };
+
    // define client here to reuse it for all requests
    let client = reqwest::ClientBuilder::new()
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics = HashMap::new();
-    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
-
-    loop {
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => {
-                info!("collect_metrics received cancellation request");
-                return Ok(());
-            },
-            tick_at = ticker.tick() => {
-
-                // send cached metrics every cached_metric_collection_interval
-                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
-
-                if send_cached {
-                    prev_iteration_time = std::time::Instant::now();
-                }
-
-                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    metric_collection_interval,
-                    "consumption_metrics_collect_metrics",
-                );
-            }
-        }
-    }
-}
-
-/// One iteration of metrics collection
-///
-/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
-/// Cache metrics to avoid sending the same metrics multiple times.
-///
-/// This function handles all errors internally
-/// and doesn't break iteration if just one tenant fails.
-///
-/// TODO
-/// - refactor this function (chunking+sending part) to reuse it in proxy module;
-async fn collect_metrics_iteration(
-    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
-    metric_collection_endpoint: &reqwest::Url,
-    node_id: NodeId,
-    ctx: &RequestContext,
-    send_cached: bool,
-) {
-    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
-    trace!(
-        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
-        metric_collection_endpoint
-    );
-
-    // get list of tenants
-    let tenants = match mgr::list_tenants().await {
-        Ok(tenants) => tenants,
-        Err(err) => {
-            error!("failed to list tenants: {:?}", err);
-            return;
-        }
-    };
-
-    // iterate through list of Active tenants and collect metrics
-    for (tenant_id, tenant_state) in tenants {
-        if tenant_state != TenantState::Active {
-            continue;
-        }
-
-        let tenant = match mgr::get_tenant(tenant_id, true).await {
-            Ok(tenant) => tenant,
-            Err(err) => {
-                // It is possible that tenant was deleted between
-                // `list_tenants` and `get_tenant`, so just warn about it.
-                warn!("failed to get tenant {tenant_id:?}: {err:?}");
-                continue;
-            }
-        };
-
-        let mut tenant_resident_size = 0;
-
-        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines() {
-            // collect per-timeline metrics only for active timelines
-
-            let timeline_id = timeline.timeline_id;
-
-            match TimelineSnapshot::collect(&timeline, ctx) {
-                Ok(Some(snap)) => {
-                    snap.to_metrics(
-                        tenant_id,
-                        timeline_id,
-                        Utc::now(),
-                        &mut current_metrics,
-                        cached_metrics,
-                    );
-                }
-                Ok(None) => {}
-                Err(e) => {
-                    error!(
-                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
-                        timeline.timeline_id
-                    );
-                    continue;
-                }
-            }
-
-            tenant_resident_size += timeline.resident_physical_size();
-        }
-
-        current_metrics
-            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
-
-        current_metrics
-            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
-
-        // Note that this metric is calculated in a separate bgworker
-        // Here we only use cached value, which may lag behind the real latest one
-        let synthetic_size = tenant.cached_synthetic_size();
-
-        if synthetic_size != 0 {
-            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics
-                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
-        }
-    }
-
-    // Filter metrics, unless we want to send all metrics, including cached ones.
-    // See: https://github.com/neondatabase/neon/issues/3485
-    if !send_cached {
-        current_metrics.retain(|(curr_key, (kind, curr_val))| {
-            if kind.is_incremental() {
-                // incremental values (currently only written_size_delta) should not get any cache
-                // deduplication because they will be used by upstream for "is still alive."
-                true
-            } else {
-                match cached_metrics.get(curr_key) {
-                    Some((_, val)) => val != curr_val,
-                    None => true,
-                }
-            }
-        });
-    }
-
-    if current_metrics.is_empty() {
-        trace!("no new metrics to send");
-        return;
-    }
-
-    // Send metrics.
-    // Split into chunks of 1000 metrics to avoid exceeding the max request size
-    let chunks = current_metrics.chunks(CHUNK_SIZE);
-
-    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

    let node_id = node_id.to_string();

-    for chunk in chunks {
-        chunk_to_send.clear();
+    // reminder: ticker is ready immediatedly
+    let mut ticker = tokio::time::interval(metric_collection_interval);

-        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
-            kind: *when,
-            metric: curr_key.metric,
-            idempotency_key: idempotency_key(&node_id),
-            value: *curr_val,
-            extra: Ids {
-                tenant_id: curr_key.tenant_id,
-                timeline_id: curr_key.timeline_id,
-            },
-        }));
+    loop {
+        let tick_at = tokio::select! {
+            _ = cancel.cancelled() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

-        const MAX_RETRIES: u32 = 3;
+        // these are point in time, with variable "now"
+        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        for attempt in 0..MAX_RETRIES {
-            let res = client
-                .post(metric_collection_endpoint.clone())
-                .json(&EventChunk {
-                    events: (&chunk_to_send).into(),
-                })
-                .send()
-                .await;
+        if metrics.is_empty() {
+            continue;
+        }

-            match res {
-                Ok(res) => {
-                    if res.status().is_success() {
-                        // update cached metrics after they were sent successfully
-                        for (curr_key, curr_val) in chunk.iter() {
-                            cached_metrics.insert(curr_key.clone(), *curr_val);
-                        }
-                    } else {
-                        error!("metrics endpoint refused the sent metrics: {:?}", res);
-                        for metric in chunk_to_send
-                            .iter()
-                            .filter(|metric| metric.value > (1u64 << 40))
-                        {
-                            // Report if the metric value is suspiciously large
-                            error!("potentially abnormal metric value: {:?}", metric);
-                        }
-                    }
-                    break;
+        let metrics = Arc::new(metrics);
+
+        // why not race cancellation here? because we are one of the last tasks, and if we are
+        // already here, better to try to flush the new values.
+
+        let flush = async {
+            match disk_cache::flush_metrics_to_disk(&metrics, &path).await {
+                Ok(()) => {
+                    tracing::debug!("flushed metrics to disk");
                }
-                Err(err) if err.is_timeout() => {
-                    error!(attempt, "timeout sending metrics, retrying immediately");
-                    continue;
-                }
-                Err(err) => {
-                    error!(attempt, ?err, "failed to send metrics");
-                    break;
+                Err(e) => {
+                    // idea here is that if someone creates a directory as our path, then they
+                    // might notice it from the logs before shutdown and remove it
+                    tracing::error!("failed to persist metrics to {path:?}: {e:#}");
                }
            }
+        };
+
+        let upload = async {
+            let res = upload::upload_metrics(
+                &client,
+                metric_collection_endpoint,
+                &cancel,
+                &node_id,
+                &metrics,
+                &mut cached_metrics,
+            )
+            .await;
+            if let Err(e) = res {
+                // serialization error which should never happen
+                tracing::error!("failed to upload due to {e:#}");
+            }
+        };
+
+        // let these run concurrently
+        let (_, _) = tokio::join!(flush, upload);
+
+        crate::tenant::tasks::warn_when_period_overrun(
+            tick_at.elapsed(),
+            metric_collection_interval,
+            "consumption_metrics_collect_metrics",
+        );
+    }
+}
+
+/// Called on the first iteration in an attempt to join the metric uploading schedule from previous
+/// pageserver session. Pageserver is supposed to upload at intervals regardless of restarts.
+///
+/// Cancellation safe.
+async fn restore_and_reschedule(
+    path: &Arc<PathBuf>,
+    metric_collection_interval: Duration,
+) -> Cache {
+    let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
+        Ok(found_some) => {
+            // there is no min needed because we write these sequentially in
+            // collect_all_metrics
+            let earlier_metric_at = found_some
+                .iter()
+                .map(|(_, (et, _))| et.recorded_at())
+                .copied()
+                .next();
+
+            let cached = found_some.into_iter().collect::<Cache>();
+
+            (cached, earlier_metric_at)
+        }
+        Err(e) => {
+            use std::io::{Error, ErrorKind};
+
+            let root = e.root_cause();
+            let maybe_ioerr = root.downcast_ref::<Error>();
+            let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound);
+
+            if !is_not_found {
+                tracing::info!("failed to read any previous metrics from {path:?}: {e:#}");
+            }
+
+            (HashMap::new(), None)
+        }
+    };
+
+    if let Some(earlier_metric_at) = earlier_metric_at {
+        let earlier_metric_at: SystemTime = earlier_metric_at.into();
+
+        let error = reschedule(earlier_metric_at, metric_collection_interval).await;
+
+        if let Some(error) = error {
+            if error.as_secs() >= 60 {
+                tracing::info!(
+                    error_ms = error.as_millis(),
+                    "startup scheduling error due to restart"
+                )
+            }
        }
    }
+
+    cached
 }

-/// Internal type to make timeline metric production testable.
-///
-/// As this value type contains all of the information needed from a timeline to produce the
-/// metrics, it can easily be created with different values in test.
-struct TimelineSnapshot {
-    loaded_at: (Lsn, SystemTime),
-    last_record_lsn: Lsn,
-    current_exact_logical_size: Option<u64>,
-}
+async fn reschedule(
+    earlier_metric_at: SystemTime,
+    metric_collection_interval: Duration,
+) -> Option<Duration> {
+    let now = SystemTime::now();
+    match now.duration_since(earlier_metric_at) {
+        Ok(from_last_send) if from_last_send < metric_collection_interval => {
+            let sleep_for = metric_collection_interval - from_last_send;

-impl TimelineSnapshot {
-    /// Collect the metrics from an actual timeline.
-    ///
-    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    fn collect(
-        t: &Arc<crate::tenant::Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Option<Self>> {
-        use anyhow::Context;
+            let deadline = std::time::Instant::now() + sleep_for;

-        if !t.is_active() {
-            // no collection for broken or stopping needed, we will still keep the cached values
-            // though at the caller.
-            Ok(None)
-        } else {
-            let loaded_at = t.loaded_at;
-            let last_record_lsn = t.get_last_record_lsn();
+            tokio::time::sleep_until(deadline.into()).await;

-            let current_exact_logical_size = {
-                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
-                    // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
-                }
-            };
+            let now = std::time::Instant::now();

-            Ok(Some(TimelineSnapshot {
-                loaded_at,
-                last_record_lsn,
-                current_exact_logical_size,
-            }))
-        }
-    }
-
-    /// Produce the timeline consumption metrics into the `metrics` argument.
-    fn to_metrics(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        now: DateTime<Utc>,
-        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
-        cache: &HashMap<MetricsKey, (EventType, u64)>,
-    ) {
-        let timeline_written_size = u64::from(self.last_record_lsn);
-
-        let (key, written_size_now) =
-            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
-
-        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-        // features might change this.
-
-        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
-
-        // use this when available, because in a stream of incremental values, it will be
-        // accurate where as when last_record_lsn stops moving, we will only cache the last
-        // one of those.
-        let last_stop_time = cache
-            .get(written_size_delta_key.key())
-            .map(|(until, _val)| {
-                until
-                    .incremental_timerange()
-                    .expect("never create EventType::Absolute for written_size_delta")
-                    .end
-            });
-
-        // by default, use the last sent written_size as the basis for
-        // calculating the delta. if we don't yet have one, use the load time value.
-        let prev = cache
-            .get(&key)
-            .map(|(prev_at, prev)| {
-                // use the prev time from our last incremental update, or default to latest
-                // absolute update on the first round.
-                let prev_at = prev_at
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let prev_at = last_stop_time.unwrap_or(prev_at);
-                (*prev_at, *prev)
-            })
-            .unwrap_or_else(|| {
-                // if we don't have a previous point of comparison, compare to the load time
-                // lsn.
-                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
-                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-            });
-
-        // written_size_bytes_delta
-        metrics.extend(
-            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                let up_to = written_size_now
-                    .0
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                Some(key_value)
+            // executor threads might be busy, add extra measurements
+            Some(if now < deadline {
+                deadline - now
            } else {
-                None
-            },
-        );
-
-        // written_size
-        metrics.push((key, written_size_now));
-
-        if let Some(size) = self.current_exact_logical_size {
-            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+                now - deadline
+            })
+        }
+        Ok(from_last_send) => Some(from_last_send.saturating_sub(metric_collection_interval)),
+        Err(_) => {
+            tracing::warn!(
+                ?now,
+                ?earlier_metric_at,
+                "oldest recorded metric is in future; first values will come out with inconsistent timestamps"
+            );
+            earlier_metric_at.duration_since(now).ok()
        }
    }
 }

 /// Caclculate synthetic size for each active tenant
-pub async fn calculate_synthetic_size_worker(
+async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");

+    // reminder: ticker is ready immediatedly
    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
+    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => {
-                return Ok(());
-            },
-            tick_at = ticker.tick() => {
+        let tick_at = tokio::select! {
+            _ = task_mgr::shutdown_watcher() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

-                let tenants = match mgr::list_tenants().await {
-                    Ok(tenants) => tenants,
-                    Err(e) => {
-                        warn!("cannot get tenant list: {e:#}");
-                        continue;
-                    }
-                };
-                // iterate through list of Active tenants and collect metrics
-                for (tenant_id, tenant_state) in tenants {
+        let tenants = match mgr::list_tenants().await {
+            Ok(tenants) => tenants,
+            Err(e) => {
+                warn!("cannot get tenant list: {e:#}");
+                continue;
+            }
+        };

-                    if tenant_state != TenantState::Active {
-                        continue;
-                    }
-
-                    if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
-                    {
-                        if let Err(e) = tenant.calculate_synthetic_size(
-                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
-                            ctx).await {
-                            error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
-                        }
-                    }
+        for (tenant_id, tenant_state) in tenants {
+            if tenant_state != TenantState::Active {
+                continue;
+            }

+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
+                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    synthetic_size_calculation_interval,
-                    "consumption_metrics_synthetic_size_worker",
-                );
            }
        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-
-    use std::time::SystemTime;
-    use utils::{
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
-
-    use crate::consumption_metrics::MetricsKey;
-
-    use super::TimelineSnapshot;
-    use chrono::{DateTime, Utc};
-
-    #[test]
-    fn startup_collected_timeline_metrics_before_advancing() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::new();
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, SystemTime::now()),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        let now = DateTime::<Utc>::from(SystemTime::now());
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    snap.loaded_at.1.into(),
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_second_round() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id)
-                    .from_previous_up_to(before, now, 0),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, just_before, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let just_before = DateTime::<Utc>::from(just_before);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            // at t=before was the last time the last_record_lsn changed
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
-            // end time of this event is used for the next ones
-            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                before,
-                just_before,
-                0,
-            ),
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    just_before,
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
-        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
-        times[0] = std::time::SystemTime::now();
-        for behind in 1..N {
-            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
-        }
-
-        times
+
+        crate::tenant::tasks::warn_when_period_overrun(
+            tick_at.elapsed(),
+            synthetic_size_calculation_interval,
+            "consumption_metrics_synthetic_size_worker",
+        );
    }
 }
--- a/pageserver/src/consumption_metrics/disk_cache.rs
+++ b/pageserver/src/consumption_metrics/disk_cache.rs
@@ -0,0 +1,117 @@
+use anyhow::Context;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use super::RawMetric;
+
+pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
+    // do not add context to each error, callsite will log with full path
+    let span = tracing::Span::current();
+    tokio::task::spawn_blocking(move || {
+        let _e = span.entered();
+
+        if let Some(parent) = path.parent() {
+            if let Err(e) = scan_and_delete_with_same_prefix(&path) {
+                tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}");
+            }
+        }
+
+        let mut file = std::fs::File::open(&*path)?;
+        let reader = std::io::BufReader::new(&mut file);
+        anyhow::Ok(serde_json::from_reader::<_, Vec<RawMetric>>(reader)?)
+    })
+    .await
+    .context("read metrics join error")
+    .and_then(|x| x)
+}
+
+fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
+    let it = std::fs::read_dir(path.parent().expect("caller checked"))?;
+
+    let prefix = path.file_name().expect("caller checked").to_string_lossy();
+
+    for entry in it {
+        let entry = entry?;
+        if !entry.metadata()?.is_file() {
+            continue;
+        }
+        let file_name = entry.file_name();
+
+        if path.file_name().unwrap() == file_name {
+            // do not remove our actual file
+            continue;
+        }
+
+        let file_name = file_name.to_string_lossy();
+
+        if !file_name.starts_with(&*prefix) {
+            continue;
+        }
+
+        let path = entry.path();
+
+        if let Err(e) = std::fs::remove_file(&path) {
+            tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}");
+        } else {
+            tracing::info!("cleaned up old tempfile {file_name:?}");
+        }
+    }
+
+    Ok(())
+}
+
+pub(super) async fn flush_metrics_to_disk(
+    current_metrics: &Arc<Vec<RawMetric>>,
+    path: &Arc<PathBuf>,
+) -> anyhow::Result<()> {
+    use std::io::Write;
+
+    anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}");
+    anyhow::ensure!(
+        path.file_name().is_some(),
+        "path must have filename: {path:?}"
+    );
+
+    let span = tracing::Span::current();
+    tokio::task::spawn_blocking({
+        let current_metrics = current_metrics.clone();
+        let path = path.clone();
+        move || {
+            let _e = span.entered();
+
+            let parent = path.parent().expect("existence checked");
+            let file_name = path.file_name().expect("existence checked");
+            let mut tempfile = tempfile::Builder::new()
+                .prefix(file_name)
+                .suffix(".tmp")
+                .tempfile_in(parent)?;
+
+            tracing::debug!("using tempfile {:?}", tempfile.path());
+
+            // write out all of the raw metrics, to be read out later on restart as cached values
+            {
+                let mut writer = std::io::BufWriter::new(&mut tempfile);
+                serde_json::to_writer(&mut writer, &*current_metrics)
+                    .context("serialize metrics")?;
+                writer
+                    .into_inner()
+                    .map_err(|_| anyhow::anyhow!("flushing metrics failed"))?;
+            }
+
+            tempfile.flush()?;
+            tempfile.as_file().sync_all()?;
+
+            fail::fail_point!("before-persist-last-metrics-collected");
+
+            drop(tempfile.persist(&*path).map_err(|e| e.error)?);
+
+            let f = std::fs::File::open(path.parent().unwrap())?;
+            f.sync_all()?;
+
+            anyhow::Ok(())
+        }
+    })
+    .await
+    .with_context(|| format!("write metrics to {path:?} join error"))
+    .and_then(|x| x.with_context(|| format!("write metrics to {path:?}")))
+}
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -0,0 +1,455 @@
+use crate::context::RequestContext;
+use anyhow::Context;
+use chrono::{DateTime, Utc};
+use consumption_metrics::EventType;
+use futures::stream::StreamExt;
+use serde_with::serde_as;
+use std::{sync::Arc, time::SystemTime};
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::{Cache, RawMetric};
+
+/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
+/// instead of static str.
+// Do not rename any of these without first consulting with data team and partner
+// management.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
+pub(super) enum Name {
+    /// Timeline last_record_lsn, absolute
+    #[serde(rename = "written_size")]
+    WrittenSize,
+    /// Timeline last_record_lsn, incremental
+    #[serde(rename = "written_data_bytes_delta")]
+    WrittenSizeDelta,
+    /// Timeline logical size
+    #[serde(rename = "timeline_logical_size")]
+    LogicalSize,
+    /// Tenant remote size
+    #[serde(rename = "remote_storage_size")]
+    RemoteSize,
+    /// Tenant resident size
+    #[serde(rename = "resident_size")]
+    ResidentSize,
+    /// Tenant synthetic size
+    #[serde(rename = "synthetic_storage_size")]
+    SyntheticSize,
+}
+
+/// Key that uniquely identifies the object this metric describes.
+///
+/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
+/// elsewhere.
+#[serde_with::serde_as]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
+pub(crate) struct MetricsKey {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub(super) tenant_id: TenantId,
+
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(super) timeline_id: Option<TimelineId>,
+
+    pub(super) metric: Name,
+}
+
+impl MetricsKey {
+    const fn absolute_values(self) -> AbsoluteValueFactory {
+        AbsoluteValueFactory(self)
+    }
+    const fn incremental_values(self) -> IncrementalValueFactory {
+        IncrementalValueFactory(self)
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only absolute values.
+struct AbsoluteValueFactory(MetricsKey);
+
+impl AbsoluteValueFactory {
+    const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
+        let key = self.0;
+        (key, (EventType::Absolute { time }, val))
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only incremental values.
+struct IncrementalValueFactory(MetricsKey);
+
+impl IncrementalValueFactory {
+    #[allow(clippy::wrong_self_convention)]
+    const fn from_until(
+        self,
+        prev_end: DateTime<Utc>,
+        up_to: DateTime<Utc>,
+        val: u64,
+    ) -> RawMetric {
+        let key = self.0;
+        // cannot assert prev_end < up_to because these are realtime clock based
+        let when = EventType::Incremental {
+            start_time: prev_end,
+            stop_time: up_to,
+        };
+        (key, (when, val))
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+// the static part of a MetricsKey
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSize,
+        }
+        .absolute_values()
+    }
+
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent, starting from the previously sent incremental time range ending at the
+    /// latest absolute measurement.
+    const fn written_size_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> IncrementalValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSizeDelta,
+        }
+        .incremental_values()
+    }
+
+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    const fn timeline_logical_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::LogicalSize,
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: Name::RemoteSize,
+        }
+        .absolute_values()
+    }
+
+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: Name::ResidentSize,
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
+    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: Name::SyntheticSize,
+        }
+        .absolute_values()
+    }
+}
+
+pub(super) async fn collect_all_metrics(
+    cached_metrics: &Cache,
+    ctx: &RequestContext,
+) -> Vec<RawMetric> {
+    use pageserver_api::models::TenantState;
+
+    let started_at = std::time::Instant::now();
+
+    let tenants = match crate::tenant::mgr::list_tenants().await {
+        Ok(tenants) => tenants,
+        Err(err) => {
+            tracing::error!("failed to list tenants: {:?}", err);
+            return vec![];
+        }
+    };
+
+    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
+        if state != TenantState::Active {
+            None
+        } else {
+            crate::tenant::mgr::get_tenant(id, true)
+                .await
+                .ok()
+                .map(|tenant| (id, tenant))
+        }
+    });
+
+    let res = collect(tenants, cached_metrics, ctx).await;
+
+    tracing::info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        total = res.len(),
+        "collected metrics"
+    );
+
+    res
+}
+
+async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
+where
+    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
+{
+    let mut current_metrics: Vec<RawMetric> = Vec::new();
+
+    let mut tenants = std::pin::pin!(tenants);
+
+    while let Some((tenant_id, tenant)) = tenants.next().await {
+        let mut tenant_resident_size = 0;
+
+        for timeline in tenant.list_timelines() {
+            let timeline_id = timeline.timeline_id;
+
+            match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(Some(snap)) => {
+                    snap.to_metrics(
+                        tenant_id,
+                        timeline_id,
+                        Utc::now(),
+                        &mut current_metrics,
+                        cache,
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    tracing::error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
+            }
+
+            tenant_resident_size += timeline.resident_physical_size();
+        }
+
+        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
+        snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
+    }
+
+    current_metrics
+}
+
+/// In-between abstraction to allow testing metrics without actual Tenants.
+struct TenantSnapshot {
+    resident_size: u64,
+    remote_size: u64,
+    synthetic_size: u64,
+}
+
+impl TenantSnapshot {
+    /// Collect tenant status to have metrics created out of it.
+    ///
+    /// `resident_size` is calculated of the timelines we had access to for other metrics, so we
+    /// cannot just list timelines here.
+    fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
+        TenantSnapshot {
+            resident_size,
+            remote_size: t.remote_size(),
+            // Note that this metric is calculated in a separate bgworker
+            // Here we only use cached value, which may lag behind the real latest one
+            synthetic_size: t.cached_synthetic_size(),
+        }
+    }
+
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        now: DateTime<Utc>,
+        cached: &Cache,
+        metrics: &mut Vec<RawMetric>,
+    ) {
+        let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
+
+        let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
+
+        let synthetic_size = {
+            let factory = MetricsKey::synthetic_size(tenant_id);
+            let mut synthetic_size = self.synthetic_size;
+
+            if synthetic_size == 0 {
+                if let Some((_, value)) = cached.get(factory.key()) {
+                    // use the latest value from previous session
+                    synthetic_size = *value;
+                }
+            }
+
+            if synthetic_size != 0 {
+                // only send non-zeroes because otherwise these show up as errors in logs
+                Some(factory.at(now, synthetic_size))
+            } else {
+                None
+            }
+        };
+
+        metrics.extend(
+            [Some(remote_size), Some(resident_size), synthetic_size]
+                .into_iter()
+                .flatten(),
+        );
+    }
+}
+
+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime),
+    last_record_lsn: Lsn,
+    current_exact_logical_size: Option<u64>,
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        if !t.is_active() {
+            // no collection for broken or stopping needed, we will still keep the cached values
+            // though at the caller.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<RawMetric>,
+        cache: &Cache,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+
+        let up_to = now;
+
+        if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+            let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
+            // written_size_delta
+            metrics.push(key_value);
+            // written_size
+            metrics.push((key, written_size_now));
+        } else {
+            // the cached value was ahead of us, report zero until we've caught up
+            metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
+            // the cached value was ahead of us, report the same until we've caught up
+            metrics.push((key, (written_size_now.0, prev.1)));
+        }
+
+        {
+            let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
+            let current_or_previous = self
+                .current_exact_logical_size
+                .or_else(|| cache.get(factory.key()).map(|(_, val)| *val));
+
+            if let Some(size) = current_or_previous {
+                metrics.push(factory.at(now, size));
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests;
+
+#[cfg(test)]
+pub(crate) use tests::metric_examples;
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -0,0 +1,297 @@
+use super::*;
+use std::collections::HashMap;
+use std::time::SystemTime;
+use utils::lsn::Lsn;
+
+#[test]
+fn startup_collected_timeline_metrics_before_advancing() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::new();
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, SystemTime::now()),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                snap.loaded_at.1.into(),
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+        ]
+    );
+}
+
+#[test]
+fn startup_collected_timeline_metrics_second_round() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, before, init] = time_backwards();
+
+    let now = DateTime::<Utc>::from(now);
+    let before = DateTime::<Utc>::from(before);
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
+    ]);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, init),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+        ]
+    );
+}
+
+#[test]
+fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, just_before, before, init] = time_backwards();
+
+    let now = DateTime::<Utc>::from(now);
+    let just_before = DateTime::<Utc>::from(just_before);
+    let before = DateTime::<Utc>::from(before);
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::from([
+        // at t=before was the last time the last_record_lsn changed
+        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
+        // end time of this event is used for the next ones
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0),
+    ]);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, init),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+        ]
+    );
+}
+
+#[test]
+fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
+    // it can happen that we lose the inmemorylayer but have previously sent metrics and we
+    // should never go backwards
+
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [later, now, at_restart] = time_backwards();
+
+    // FIXME: tests would be so much easier if we did not need to juggle back and forth
+    // SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
+    let now = DateTime::<Utc>::from(now);
+    let later = DateTime::<Utc>::from(later);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+    let way_before = DateTime::<Utc>::from(way_before);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        current_exact_logical_size: None,
+    };
+
+    let mut cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100),
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+            way_before,
+            before_restart,
+            // not taken into account, but the timestamps are important
+            999_999_999,
+        ),
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                before_restart,
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+        ]
+    );
+
+    // now if we cache these metrics, and re-run while "still in recovery"
+    cache.extend(metrics.drain(..));
+
+    // "still in recovery", because our snapshot did not change
+    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+        ]
+    );
+}
+
+#[test]
+fn post_restart_current_exact_logical_size_uses_cached() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, at_restart] = time_backwards();
+
+    let now = DateTime::<Utc>::from(now);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        current_exact_logical_size: None,
+    };
+
+    let cache = HashMap::from([
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100)
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    metrics.retain(|(key, _)| key.metric == Name::LogicalSize);
+
+    assert_eq!(
+        metrics,
+        &[MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 100)]
+    );
+}
+
+#[test]
+fn post_restart_synthetic_size_uses_cached_if_available() {
+    let tenant_id = TenantId::generate();
+
+    let ts = TenantSnapshot {
+        resident_size: 1000,
+        remote_size: 1000,
+        // not yet calculated
+        synthetic_size: 0,
+    };
+
+    let now = SystemTime::now();
+    let before_restart = DateTime::<Utc>::from(now - std::time::Duration::from_secs(5 * 60));
+    let now = DateTime::<Utc>::from(now);
+
+    let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]);
+
+    let mut metrics = Vec::new();
+    ts.to_metrics(tenant_id, now, &cached, &mut metrics);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+            MetricsKey::resident_size(tenant_id).at(now, 1000),
+            MetricsKey::synthetic_size(tenant_id).at(now, 1000),
+        ]
+    );
+}
+
+#[test]
+fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
+    let tenant_id = TenantId::generate();
+
+    let ts = TenantSnapshot {
+        resident_size: 1000,
+        remote_size: 1000,
+        // not yet calculated
+        synthetic_size: 0,
+    };
+
+    let now = SystemTime::now();
+    let now = DateTime::<Utc>::from(now);
+
+    let cached = HashMap::new();
+
+    let mut metrics = Vec::new();
+    ts.to_metrics(tenant_id, now, &cached, &mut metrics);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+            MetricsKey::resident_size(tenant_id).at(now, 1000),
+            // no synthetic size here
+        ]
+    );
+}
+
+fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+    let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
+    times[0] = std::time::SystemTime::now();
+    for behind in 1..N {
+        times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
+    }
+
+    times
+}
+
+pub(crate) const fn metric_examples(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    now: DateTime<Utc>,
+    before: DateTime<Utc>,
+) -> [RawMetric; 6] {
+    [
+        MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::remote_storage_size(tenant_id).at(now, 0),
+        MetricsKey::resident_size(tenant_id).at(now, 0),
+        MetricsKey::synthetic_size(tenant_id).at(now, 1),
+    ]
+}
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -0,0 +1,443 @@
+use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use serde_with::serde_as;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+
+use super::{metrics::Name, Cache, MetricsKey, RawMetric};
+use utils::id::{TenantId, TimelineId};
+
+/// How the metrics from pageserver are identified.
+#[serde_with::serde_as]
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
+struct Ids {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub(super) tenant_id: TenantId,
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(super) timeline_id: Option<TimelineId>,
+}
+
+#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
+pub(super) async fn upload_metrics(
+    client: &reqwest::Client,
+    metric_collection_endpoint: &reqwest::Url,
+    cancel: &CancellationToken,
+    node_id: &str,
+    metrics: &[RawMetric],
+    cached_metrics: &mut Cache,
+) -> anyhow::Result<()> {
+    let mut uploaded = 0;
+    let mut failed = 0;
+
+    let started_at = std::time::Instant::now();
+
+    let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
+
+    while let Some(res) = iter.next() {
+        let (chunk, body) = res?;
+
+        let event_bytes = body.len();
+
+        let is_last = iter.len() == 0;
+
+        let res = upload(client, metric_collection_endpoint, body, cancel, is_last)
+            .instrument(tracing::info_span!(
+                "upload",
+                %event_bytes,
+                uploaded,
+                total = metrics.len(),
+            ))
+            .await;
+
+        match res {
+            Ok(()) => {
+                for (curr_key, curr_val) in chunk {
+                    cached_metrics.insert(*curr_key, *curr_val);
+                }
+                uploaded += chunk.len();
+            }
+            Err(_) => {
+                // failure(s) have already been logged
+                //
+                // however this is an inconsistency: if we crash here, we will start with the
+                // values as uploaded. in practice, the rejections no longer happen.
+                failed += chunk.len();
+            }
+        }
+    }
+
+    let elapsed = started_at.elapsed();
+
+    tracing::info!(
+        uploaded,
+        failed,
+        elapsed_ms = elapsed.as_millis(),
+        "done sending metrics"
+    );
+
+    Ok(())
+}
+
+// The return type is quite ugly, but we gain testability in isolation
+fn serialize_in_chunks<'a, F>(
+    chunk_size: usize,
+    input: &'a [RawMetric],
+    factory: F,
+) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
+where
+    F: KeyGen<'a> + 'a,
+{
+    use bytes::BufMut;
+
+    struct Iter<'a, F> {
+        inner: std::slice::Chunks<'a, RawMetric>,
+        chunk_size: usize,
+
+        // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
+        buffer: bytes::BytesMut,
+        // chunk amount of events are reused to produce the serialized document
+        scratch: Vec<Event<Ids, Name>>,
+        factory: F,
+    }
+
+    impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
+        type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
+
+        fn next(&mut self) -> Option<Self::Item> {
+            let chunk = self.inner.next()?;
+
+            if self.scratch.is_empty() {
+                // first round: create events with N strings
+                self.scratch.extend(
+                    chunk
+                        .iter()
+                        .map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
+                );
+            } else {
+                // next rounds: update_in_place to reuse allocations
+                assert_eq!(self.scratch.len(), self.chunk_size);
+                self.scratch
+                    .iter_mut()
+                    .zip(chunk.iter())
+                    .for_each(|(slot, raw_metric)| {
+                        raw_metric.update_in_place(slot, &self.factory.generate())
+                    });
+            }
+
+            let res = serde_json::to_writer(
+                (&mut self.buffer).writer(),
+                &EventChunk {
+                    events: (&self.scratch[..chunk.len()]).into(),
+                },
+            );
+
+            match res {
+                Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))),
+                Err(e) => Some(Err(e)),
+            }
+        }
+
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            self.inner.size_hint()
+        }
+    }
+
+    impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
+
+    let buffer = bytes::BytesMut::new();
+    let inner = input.chunks(chunk_size);
+    let scratch = Vec::new();
+
+    Iter {
+        inner,
+        chunk_size,
+        buffer,
+        scratch,
+        factory,
+    }
+}
+
+trait RawMetricExt {
+    fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
+    fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
+}
+
+impl RawMetricExt for RawMetric {
+    fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
+        let MetricsKey {
+            metric,
+            tenant_id,
+            timeline_id,
+        } = self.0;
+
+        let (kind, value) = self.1;
+
+        Event {
+            kind,
+            metric,
+            idempotency_key: key.to_string(),
+            value,
+            extra: Ids {
+                tenant_id,
+                timeline_id,
+            },
+        }
+    }
+
+    fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
+        use std::fmt::Write;
+
+        let MetricsKey {
+            metric,
+            tenant_id,
+            timeline_id,
+        } = self.0;
+
+        let (kind, value) = self.1;
+
+        *event = Event {
+            kind,
+            metric,
+            idempotency_key: {
+                event.idempotency_key.clear();
+                write!(event.idempotency_key, "{key}").unwrap();
+                std::mem::take(&mut event.idempotency_key)
+            },
+            value,
+            extra: Ids {
+                tenant_id,
+                timeline_id,
+            },
+        };
+    }
+}
+
+trait KeyGen<'a>: Copy {
+    fn generate(&self) -> IdempotencyKey<'a>;
+}
+
+impl<'a> KeyGen<'a> for &'a str {
+    fn generate(&self) -> IdempotencyKey<'a> {
+        IdempotencyKey::generate(self)
+    }
+}
+
+enum UploadError {
+    Rejected(reqwest::StatusCode),
+    Reqwest(reqwest::Error),
+    Cancelled,
+}
+
+impl std::fmt::Debug for UploadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // use same impl because backoff::retry will log this using both
+        std::fmt::Display::fmt(self, f)
+    }
+}
+
+impl std::fmt::Display for UploadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use UploadError::*;
+
+        match self {
+            Rejected(code) => write!(f, "server rejected the metrics with {code}"),
+            Reqwest(e) => write!(f, "request failed: {e}"),
+            Cancelled => write!(f, "cancelled"),
+        }
+    }
+}
+
+impl UploadError {
+    fn is_reject(&self) -> bool {
+        matches!(self, UploadError::Rejected(_))
+    }
+}
+
+// this is consumed by the test verifiers
+static LAST_IN_BATCH: reqwest::header::HeaderName =
+    reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch");
+
+async fn upload(
+    client: &reqwest::Client,
+    metric_collection_endpoint: &reqwest::Url,
+    body: bytes::Bytes,
+    cancel: &CancellationToken,
+    is_last: bool,
+) -> Result<(), UploadError> {
+    let warn_after = 3;
+    let max_attempts = 10;
+    let res = utils::backoff::retry(
+        move || {
+            let body = body.clone();
+            async move {
+                let res = client
+                    .post(metric_collection_endpoint.clone())
+                    .header(reqwest::header::CONTENT_TYPE, "application/json")
+                    .header(
+                        LAST_IN_BATCH.clone(),
+                        if is_last { "true" } else { "false" },
+                    )
+                    .body(body)
+                    .send()
+                    .await;
+
+                let res = res.and_then(|res| res.error_for_status());
+
+                // 10 redirects are normally allowed, so we don't need worry about 3xx
+                match res {
+                    Ok(_response) => Ok(()),
+                    Err(e) => {
+                        let status = e.status().filter(|s| s.is_client_error());
+                        if let Some(status) = status {
+                            // rejection used to be a thing when the server could reject a
+                            // whole batch of metrics if one metric was bad.
+                            Err(UploadError::Rejected(status))
+                        } else {
+                            Err(UploadError::Reqwest(e))
+                        }
+                    }
+                }
+            }
+        },
+        UploadError::is_reject,
+        warn_after,
+        max_attempts,
+        "upload consumption_metrics",
+        utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
+    )
+    .await;
+
+    match &res {
+        Ok(_) => {}
+        Err(e) if e.is_reject() => {
+            // permanent errors currently do not get logged by backoff::retry
+            // display alternate has no effect, but keeping it here for easier pattern matching.
+            tracing::error!("failed to upload metrics: {e:#}");
+        }
+        Err(_) => {
+            // these have been logged already
+        }
+    }
+
+    res
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::{DateTime, Utc};
+    use once_cell::sync::Lazy;
+
+    #[test]
+    fn chunked_serialization() {
+        let examples = metric_samples();
+        assert!(examples.len() > 1);
+
+        let factory = FixedGen::new(Utc::now(), "1", 42);
+
+        // need to use Event here because serde_json::Value uses default hashmap, not linked
+        // hashmap
+        #[derive(serde::Deserialize)]
+        struct EventChunk {
+            events: Vec<Event<Ids, Name>>,
+        }
+
+        let correct = serialize_in_chunks(examples.len(), &examples, factory)
+            .map(|res| res.unwrap().1)
+            .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
+            .collect::<Vec<_>>();
+
+        for chunk_size in 1..examples.len() {
+            let actual = serialize_in_chunks(chunk_size, &examples, factory)
+                .map(|res| res.unwrap().1)
+                .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
+                .collect::<Vec<_>>();
+
+            // if these are equal, it means that multi-chunking version works as well
+            assert_eq!(correct, actual);
+        }
+    }
+
+    #[derive(Clone, Copy)]
+    struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);
+
+    impl<'a> FixedGen<'a> {
+        fn new(now: chrono::DateTime<chrono::Utc>, node_id: &'a str, nonce: u16) -> Self {
+            FixedGen(now, node_id, nonce)
+        }
+    }
+
+    impl<'a> KeyGen<'a> for FixedGen<'a> {
+        fn generate(&self) -> IdempotencyKey<'a> {
+            IdempotencyKey::for_tests(self.0, self.1, self.2)
+        }
+    }
+
+    static SAMPLES_NOW: Lazy<DateTime<Utc>> = Lazy::new(|| {
+        DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z")
+            .unwrap()
+            .into()
+    });
+
+    #[test]
+    fn metric_image_stability() {
+        // it is important that these strings stay as they are
+
+        let examples = [
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+        ];
+
+        let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
+        let examples = examples.into_iter().zip(metric_samples());
+
+        for ((line, expected), (key, (kind, value))) in examples {
+            let e = consumption_metrics::Event {
+                kind,
+                metric: key.metric,
+                idempotency_key: idempotency_key.to_string(),
+                value,
+                extra: Ids {
+                    tenant_id: key.tenant_id,
+                    timeline_id: key.timeline_id,
+                },
+            };
+            let actual = serde_json::to_string(&e).unwrap();
+            assert_eq!(expected, actual, "example for {kind:?} from line {line}");
+        }
+    }
+
+    fn metric_samples() -> [RawMetric; 6] {
+        let tenant_id = TenantId::from_array([0; 16]);
+        let timeline_id = TimelineId::from_array([0xff; 16]);
+
+        let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
+            .unwrap()
+            .into();
+        let [now, before] = [*SAMPLES_NOW, before];
+
+        super::super::metrics::metric_examples(tenant_id, timeline_id, now, before)
+    }
+}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -94,6 +94,18 @@ pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
+    page_content_kind: PageContentKind,
+}
+
+/// The kind of access to the page cache.
+#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
+pub enum PageContentKind {
+    Unknown,
+    DeltaLayerBtreeNode,
+    DeltaLayerValue,
+    ImageLayerBtreeNode,
+    ImageLayerValue,
+    InMemoryLayer,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -137,6 +149,7 @@ impl RequestContextBuilder {
                task_kind,
                download_behavior: DownloadBehavior::Download,
                access_stats_behavior: AccessStatsBehavior::Update,
+                page_content_kind: PageContentKind::Unknown,
            },
        }
    }
@@ -149,6 +162,7 @@ impl RequestContextBuilder {
                task_kind: original.task_kind,
                download_behavior: original.download_behavior,
                access_stats_behavior: original.access_stats_behavior,
+                page_content_kind: original.page_content_kind,
            },
        }
    }
@@ -167,6 +181,11 @@ impl RequestContextBuilder {
        self
    }

+    pub(crate) fn page_content_kind(mut self, k: PageContentKind) -> Self {
+        self.inner.page_content_kind = k;
+        self
+    }
+
    pub fn build(self) -> RequestContext {
        self.inner
    }
@@ -263,4 +282,8 @@ impl RequestContext {
    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
        self.access_stats_behavior
    }
+
+    pub(crate) fn page_content_kind(&self) -> PageContentKind {
+        self.page_content_kind
+    }
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,7 +1,9 @@
 use std::collections::HashMap;

-use hyper::StatusCode;
-use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
+use pageserver_api::control_api::{
+    ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+};
+use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{
@@ -12,25 +14,34 @@ use utils::{

 use crate::config::PageServerConf;

-// Backoffs when control plane requests do not succeed: compromise between reducing load
-// on control plane, and retrying frequently when we are blocked on a control plane
-// response to make progress.
-const BACKOFF_INCREMENT: f64 = 0.1;
-const BACKOFF_MAX: f64 = 10.0;
-
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub(crate) struct ControlPlaneClient {
+pub struct ControlPlaneClient {
    http_client: reqwest::Client,
    base_url: Url,
    node_id: NodeId,
    cancel: CancellationToken,
 }

+/// Represent operations which internally retry on all errors other than
+/// cancellation token firing: the only way they can fail is ShuttingDown.
+pub enum RetryForeverError {
+    ShuttingDown,
+}
+
+#[async_trait::async_trait]
+pub trait ControlPlaneGenerationsApi {
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+}
+
 impl ControlPlaneClient {
    /// A None return value indicates that the input `conf` object does not have control
    /// plane API enabled.
-    pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
+    pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
        let mut url = match conf.control_plane_api.as_ref() {
            Some(u) => u.clone(),
            None => return None,
@@ -42,39 +53,78 @@ impl ControlPlaneClient {
            segs.pop_if_empty().push("");
        }

-        let client = reqwest::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");
+        let mut client = reqwest::ClientBuilder::new();
+
+        if let Some(jwt) = &conf.control_plane_api_token {
+            let mut headers = hyper::HeaderMap::new();
+            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            client = client.default_headers(headers);
+        }

        Some(Self {
-            http_client: client,
+            http_client: client.build().expect("Failed to construct HTTP client"),
            base_url: url,
            node_id: conf.id,
            cancel: cancel.clone(),
        })
    }

-    async fn try_re_attach(
+    async fn retry_http_forever<R, T>(
        &self,
-        url: Url,
-        request: &ReAttachRequest,
-    ) -> anyhow::Result<ReAttachResponse> {
-        match self.http_client.post(url).json(request).send().await {
-            Err(e) => Err(anyhow::Error::from(e)),
-            Ok(r) => {
-                if r.status() == StatusCode::OK {
-                    r.json::<ReAttachResponse>()
-                        .await
-                        .map_err(anyhow::Error::from)
-                } else {
-                    Err(anyhow::anyhow!("Unexpected status {}", r.status()))
-                }
+        url: &url::Url,
+        request: R,
+    ) -> Result<T, RetryForeverError>
+    where
+        R: Serialize,
+        T: DeserializeOwned,
+    {
+        #[derive(thiserror::Error, Debug)]
+        enum RemoteAttemptError {
+            #[error("shutdown")]
+            Shutdown,
+            #[error("remote: {0}")]
+            Remote(reqwest::Error),
+        }
+
+        match backoff::retry(
+            || async {
+                let response = self
+                    .http_client
+                    .post(url.clone())
+                    .json(&request)
+                    .send()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)?;
+
+                response
+                    .error_for_status_ref()
+                    .map_err(RemoteAttemptError::Remote)?;
+                response
+                    .json::<T>()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)
+            },
+            |_| false,
+            3,
+            u32::MAX,
+            "calling control plane generation validation API",
+            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
+        )
+        .await
+        {
+            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
+            Err(RemoteAttemptError::Remote(_)) => {
+                panic!("We retry forever, this should never be reached");
            }
+            Ok(r) => Ok(r),
        }
    }
+}

-    /// Block until we get a successful response
-    pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
+#[async_trait::async_trait]
+impl ControlPlaneGenerationsApi for ControlPlaneClient {
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
@@ -83,37 +133,47 @@ impl ControlPlaneClient {
            node_id: self.node_id,
        };

-        let mut attempt = 0;
-        loop {
-            let result = self.try_re_attach(re_attach_path.clone(), &request).await;
-            match result {
-                Ok(res) => {
-                    tracing::info!(
-                        "Received re-attach response with {} tenants",
-                        res.tenants.len()
-                    );
+        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
+        tracing::info!(
+            "Received re-attach response with {} tenants",
+            response.tenants.len()
+        );

-                    return Ok(res
-                        .tenants
-                        .into_iter()
-                        .map(|t| (t.id, Generation::new(t.generation)))
-                        .collect::<HashMap<_, _>>());
-                }
-                Err(e) => {
-                    tracing::error!("Error re-attaching tenants, retrying: {e:#}");
-                    backoff::exponential_backoff(
-                        attempt,
-                        BACKOFF_INCREMENT,
-                        BACKOFF_MAX,
-                        &self.cancel,
-                    )
-                    .await;
-                    if self.cancel.is_cancelled() {
-                        return Err(anyhow::anyhow!("Shutting down"));
-                    }
-                    attempt += 1;
-                }
-            }
-        }
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|t| (t.id, Generation::new(t.generation)))
+            .collect::<HashMap<_, _>>())
+    }
+
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+        let re_attach_path = self
+            .base_url
+            .join("validate")
+            .expect("Failed to build validate path");
+
+        let request = ValidateRequest {
+            tenants: tenants
+                .into_iter()
+                .map(|(id, gen)| ValidateRequestTenant {
+                    id,
+                    gen: gen
+                        .into()
+                        .expect("Generation should always be valid for a Tenant doing deletions"),
+                })
+                .collect(),
+        };
+
+        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
+
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|rt| (rt.id, rt.valid))
+            .collect())
    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -0,0 +1,156 @@
+//! The deleter is the final stage in the deletion queue.  It accumulates remote
+//! paths to delete, and periodically executes them in batches of up to 1000
+//! using the DeleteObjects request.
+//!
+//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
+//! number of full-sized DeleteObjects requests, rather than a larger number of
+//! smaller requests.
+
+use remote_storage::GenericRemoteStorage;
+use remote_storage::RemotePath;
+use remote_storage::MAX_KEYS_PER_DELETE;
+use std::time::Duration;
+use tokio_util::sync::CancellationToken;
+use tracing::info;
+use tracing::warn;
+
+use crate::metrics;
+
+use super::DeletionQueueError;
+use super::FlushOp;
+
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+pub(super) enum DeleterMessage {
+    Delete(Vec<RemotePath>),
+    Flush(FlushOp),
+}
+
+/// Non-persistent deletion queue, for coalescing multiple object deletes into
+/// larger DeleteObjects requests.
+pub(super) struct Deleter {
+    // Accumulate up to 1000 keys for the next deletion operation
+    accumulator: Vec<RemotePath>,
+
+    rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+
+    cancel: CancellationToken,
+    remote_storage: GenericRemoteStorage,
+}
+
+impl Deleter {
+    pub(super) fn new(
+        remote_storage: GenericRemoteStorage,
+        rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            remote_storage,
+            rx,
+            cancel,
+            accumulator: Vec::new(),
+        }
+    }
+
+    /// Wrap the remote `delete_objects` with a failpoint
+    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+        fail::fail_point!("deletion-queue-before-execute", |_| {
+            info!("Skipping execution, failpoint set");
+            metrics::DELETION_QUEUE
+                .remote_errors
+                .with_label_values(&["failpoint"])
+                .inc();
+            Err(anyhow::anyhow!("failpoint hit"))
+        });
+
+        self.remote_storage.delete_objects(&self.accumulator).await
+    }
+
+    /// Block until everything in accumulator has been executed
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+            match self.remote_delete().await {
+                Ok(()) => {
+                    // Note: we assume that the remote storage layer returns Ok(()) if some
+                    // or all of the deleted objects were already gone.
+                    metrics::DELETION_QUEUE
+                        .keys_executed
+                        .inc_by(self.accumulator.len() as u64);
+                    info!(
+                        "Executed deletion batch {}..{}",
+                        self.accumulator
+                            .first()
+                            .expect("accumulator should be non-empty"),
+                        self.accumulator
+                            .last()
+                            .expect("accumulator should be non-empty"),
+                    );
+                    self.accumulator.clear();
+                }
+                Err(e) => {
+                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["execute"])
+                        .inc();
+                }
+            };
+        }
+        if self.cancel.is_cancelled() {
+            // Expose an error because we may not have actually flushed everything
+            Err(DeletionQueueError::ShuttingDown)
+        } else {
+            Ok(())
+        }
+    }
+
+    pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
+        self.accumulator.reserve(MAX_KEYS_PER_DELETE);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                return Err(DeletionQueueError::ShuttingDown);
+            }
+
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
+                    // return immediately if no work is pending
+                    self.flush().await?;
+
+                    continue;
+                }
+            };
+
+            match msg {
+                DeleterMessage::Delete(mut list) => {
+                    while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                        if self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                            self.flush().await?;
+                            // If we have received this number of keys, proceed with attempting to execute
+                            assert_eq!(self.accumulator.len(), 0);
+                        }
+
+                        let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
+                        let take_count = std::cmp::min(available_slots, list.len());
+                        for path in list.drain(list.len() - take_count..) {
+                            self.accumulator.push(path);
+                        }
+                    }
+                }
+                DeleterMessage::Flush(flush_op) => {
+                    // If flush() errors, we drop the flush_op and the caller will get
+                    // an error recv()'ing their oneshot channel.
+                    self.flush().await?;
+                    flush_op.notify();
+                }
+            }
+        }
+    }
+}
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -0,0 +1,487 @@
+//! The list writer is the first stage in the deletion queue.  It accumulates
+//! layers to delete, and periodically writes out these layers into a persistent
+//! DeletionList.
+//!
+//! The purpose of writing DeletionLists is to decouple the decision to
+//! delete an object from the validation required to execute it: even if
+//! validation is not possible, e.g. due to a control plane outage, we can
+//! still persist our intent to delete an object, in a way that would
+//! survive a restart.
+//!
+//! DeletionLists are passed onwards to the Validator.
+
+use super::DeletionHeader;
+use super::DeletionList;
+use super::FlushOp;
+use super::ValidatorQueueMessage;
+
+use std::collections::HashMap;
+use std::fs::create_dir_all;
+use std::time::Duration;
+
+use regex::Regex;
+use remote_storage::RemotePath;
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+use utils::generation::Generation;
+use utils::id::TenantId;
+use utils::id::TimelineId;
+
+use crate::config::PageServerConf;
+use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::storage_layer::LayerFileName;
+
+// The number of keys in a DeletionList before we will proactively persist it
+// (without reaching a flush deadline).  This aims to deliver objects of the order
+// of magnitude 1MB when we are under heavy delete load.
+const DELETION_LIST_TARGET_SIZE: usize = 16384;
+
+// Ordinarily, we only flush to DeletionList periodically, to bound the window during
+// which we might leak objects from not flushing a DeletionList after
+// the objects are already unlinked from timeline metadata.
+const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
+
+// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
+// more objects before doing the flush.
+const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
+
+#[derive(Debug)]
+pub(super) struct DeletionOp {
+    pub(super) tenant_id: TenantId,
+    pub(super) timeline_id: TimelineId,
+    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
+    // have a config object handy to project it to a remote key, and need the consuming worker
+    // to do it for you.
+    pub(super) layers: Vec<(LayerFileName, Generation)>,
+    pub(super) objects: Vec<RemotePath>,
+
+    /// The _current_ generation of the Tenant attachment in which we are enqueuing
+    /// this deletion.
+    pub(super) generation: Generation,
+}
+
+#[derive(Debug)]
+pub(super) struct RecoverOp {
+    pub(super) attached_tenants: HashMap<TenantId, Generation>,
+}
+
+#[derive(Debug)]
+pub(super) enum ListWriterQueueMessage {
+    Delete(DeletionOp),
+    // Wait until all prior deletions make it into a persistent DeletionList
+    Flush(FlushOp),
+    // Wait until all prior deletions have been executed (i.e. objects are actually deleted)
+    FlushExecute(FlushOp),
+    // Call once after re-attaching to control plane, to notify the deletion queue about
+    // latest attached generations & load any saved deletion lists from disk.
+    Recover(RecoverOp),
+}
+
+pub(super) struct ListWriter {
+    conf: &'static PageServerConf,
+
+    // Incoming frontend requests to delete some keys
+    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+
+    // Outbound requests to the backend to execute deletion lists we have composed.
+    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+
+    // The list we are currently building, contains a buffer of keys to delete
+    // and our next sequence number
+    pending: DeletionList,
+
+    // These FlushOps should notify the next time we flush
+    pending_flushes: Vec<FlushOp>,
+
+    // Worker loop is torn down when this fires.
+    cancel: CancellationToken,
+
+    // Safety guard to do recovery exactly once
+    recovered: bool,
+}
+
+impl ListWriter {
+    // Initially DeletionHeader.validated_sequence is zero.  The place we start our
+    // sequence numbers must be higher than that.
+    const BASE_SEQUENCE: u64 = 1;
+
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            pending: DeletionList::new(Self::BASE_SEQUENCE),
+            conf,
+            rx,
+            tx,
+            pending_flushes: Vec::new(),
+            cancel,
+            recovered: false,
+        }
+    }
+
+    /// Try to flush `list` to persistent storage
+    ///
+    /// This does not return errors, because on failure to flush we do not lose
+    /// any state: flushing will be retried implicitly on the next deadline
+    async fn flush(&mut self) {
+        if self.pending.is_empty() {
+            for f in self.pending_flushes.drain(..) {
+                f.notify();
+            }
+            return;
+        }
+
+        match self.pending.save(self.conf).await {
+            Ok(_) => {
+                info!(sequence = self.pending.sequence, "Stored deletion list");
+
+                for f in self.pending_flushes.drain(..) {
+                    f.notify();
+                }
+
+                // Take the list we've accumulated, replace it with a fresh list for the next sequence
+                let next_list = DeletionList::new(self.pending.sequence + 1);
+                let list = std::mem::replace(&mut self.pending, next_list);
+
+                if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
+                    // This is allowed to fail: it will only happen if the backend worker is shut down,
+                    // so we can just drop this on the floor.
+                    info!("Deletion list dropped, this is normal during shutdown ({e:#})");
+                }
+            }
+            Err(e) => {
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                warn!(
+                    sequence = self.pending.sequence,
+                    "Failed to write deletion list, will retry later ({e:#})"
+                );
+            }
+        }
+    }
+
+    /// Load the header, to learn the sequence number up to which deletions
+    /// have been validated.  We will apply validated=true to DeletionLists
+    /// <= this sequence when loading them.
+    ///
+    /// It is not an error for the header to not exist: we return None, and
+    /// the caller should act as if validated_sequence is 0
+    async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
+        let header_path = self.conf.deletion_header_path();
+        match tokio::fs::read(&header_path).await {
+            Ok(header_bytes) => {
+                match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
+                    Ok(h) => Ok(Some(h.validated_sequence)),
+                    Err(e) => {
+                        warn!(
+                            "Failed to deserialize deletion header, ignoring {}: {e:#}",
+                            header_path.display()
+                        );
+                        // This should never happen unless we make a mistake with our serialization.
+                        // Ignoring a deletion header is not consequential for correctnes because all deletions
+                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        Ok(None)
+                    }
+                }
+            }
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    debug!(
+                        "Deletion header {} not found, first start?",
+                        header_path.display()
+                    );
+                    Ok(None)
+                } else {
+                    Err(anyhow::anyhow!(e))
+                }
+            }
+        }
+    }
+
+    async fn recover(
+        &mut self,
+        attached_tenants: HashMap<TenantId, Generation>,
+    ) -> Result<(), anyhow::Error> {
+        debug!(
+            "recovering with {} attached tenants",
+            attached_tenants.len()
+        );
+
+        // Load the header
+        let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
+
+        self.pending.sequence = validated_sequence + 1;
+
+        let deletion_directory = self.conf.deletion_prefix();
+        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+            Ok(d) => d,
+            Err(e) => {
+                warn!(
+                    "Failed to open deletion list directory {}: {e:#}",
+                    deletion_directory.display(),
+                );
+
+                // Give up: if we can't read the deletion list directory, we probably can't
+                // write lists into it later, so the queue won't work.
+                return Err(e.into());
+            }
+        };
+
+        let list_name_pattern =
+            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
+
+        let header_path = self.conf.deletion_header_path();
+        let mut seqs: Vec<u64> = Vec::new();
+        while let Some(dentry) = dir.next_entry().await? {
+            let file_name = dentry.file_name();
+            let dentry_str = file_name.to_string_lossy();
+
+            if Some(file_name.as_os_str()) == header_path.file_name() {
+                // Don't try and parse the header's name like a list
+                continue;
+            }
+
+            if dentry_str.ends_with(TEMP_SUFFIX) {
+                info!("Cleaning up temporary file {dentry_str}");
+                let absolute_path = deletion_directory.join(dentry.file_name());
+                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+                    // Non-fatal error: we will just leave the file behind but not
+                    // try and load it.
+                    warn!(
+                        "Failed to clean up temporary file {}: {e:#}",
+                        absolute_path.display()
+                    );
+                }
+
+                continue;
+            }
+
+            let file_name = dentry.file_name().to_owned();
+            let basename = file_name.to_string_lossy();
+            let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
+                m.name("sequence")
+                    .expect("Non optional group should be present")
+                    .as_str()
+            } else {
+                warn!("Unexpected key in deletion queue: {basename}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                continue;
+            };
+
+            let seq: u64 = match u64::from_str_radix(seq_part, 16) {
+                Ok(s) => s,
+                Err(e) => {
+                    warn!("Malformed key '{basename}': {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+            seqs.push(seq);
+        }
+        seqs.sort();
+
+        // Start our next deletion list from after the last location validated by
+        // previous process lifetime, or after the last location found (it is updated
+        // below after enumerating the deletion lists)
+        self.pending.sequence = validated_sequence + 1;
+        if let Some(max_list_seq) = seqs.last() {
+            self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
+        }
+
+        for s in seqs {
+            let list_path = self.conf.deletion_list_path(s);
+
+            let list_bytes = tokio::fs::read(&list_path).await?;
+
+            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
+                Ok(l) => l,
+                Err(e) => {
+                    // Drop the list on the floor: any objects it referenced will be left behind
+                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
+                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+
+            if deletion_list.sequence <= validated_sequence {
+                // If the deletion list falls below valid_seq, we may assume that it was
+                // already validated the last time this pageserver ran.  Otherwise, we still
+                // load it, as it may still contain content valid in this generation.
+                deletion_list.validated = true;
+            } else {
+                // Special case optimization: if a tenant is still attached, and no other
+                // generation was issued to another node in the interval while we restarted,
+                // then we may treat deletion lists from the previous generation as if they
+                // belong to our currently attached generation, and proceed to validate & execute.
+                for (tenant_id, tenant_list) in &mut deletion_list.tenants {
+                    if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+                        if attached_gen.previous() == tenant_list.generation {
+                            tenant_list.generation = *attached_gen;
+                        }
+                    }
+                }
+            }
+
+            info!(
+                validated = deletion_list.validated,
+                sequence = deletion_list.sequence,
+                "Recovered deletion list"
+            );
+
+            // We will drop out of recovery if this fails: it indicates that we are shutting down
+            // or the backend has panicked
+            metrics::DELETION_QUEUE
+                .keys_submitted
+                .inc_by(deletion_list.len() as u64);
+            self.tx
+                .send(ValidatorQueueMessage::Delete(deletion_list))
+                .await?;
+        }
+
+        info!(next_sequence = self.pending.sequence, "Replay complete");
+
+        Ok(())
+    }
+
+    /// This is the front-end ingest, where we bundle up deletion requests into DeletionList
+    /// and write them out, for later validation by the backend and execution by the executor.
+    pub(super) async fn background(&mut self) {
+        info!("Started deletion frontend worker");
+
+        // Synchronous, but we only do it once per process lifetime so it's tolerable
+        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+            tracing::error!(
+                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
+                self.conf.deletion_prefix().display()
+            );
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
+            return;
+        }
+
+        while !self.cancel.is_cancelled() {
+            let timeout = if self.pending_flushes.is_empty() {
+                FRONTEND_DEFAULT_TIMEOUT
+            } else {
+                FRONTEND_FLUSHING_TIMEOUT
+            };
+
+            let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
+                Ok(Some(msg)) => msg,
+                Ok(None) => {
+                    // Queue sender destroyed, shutting down
+                    break;
+                }
+                Err(_) => {
+                    // Hit deadline, flush.
+                    self.flush().await;
+                    continue;
+                }
+            };
+
+            match msg {
+                ListWriterQueueMessage::Delete(op) => {
+                    assert!(
+                        self.recovered,
+                        "Cannot process deletions before recovery.  This is a bug."
+                    );
+
+                    debug!(
+                        "Delete: ingesting {} layers, {} other objects",
+                        op.layers.len(),
+                        op.objects.len()
+                    );
+
+                    let mut layer_paths = Vec::new();
+                    for (layer, generation) in op.layers {
+                        layer_paths.push(remote_layer_path(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            &layer,
+                            generation,
+                        ));
+                    }
+                    layer_paths.extend(op.objects);
+
+                    if !self.pending.push(
+                        &op.tenant_id,
+                        &op.timeline_id,
+                        op.generation,
+                        &mut layer_paths,
+                    ) {
+                        self.flush().await;
+                        let retry_succeeded = self.pending.push(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            op.generation,
+                            &mut layer_paths,
+                        );
+                        if !retry_succeeded {
+                            // Unexpected: after we flush, we should have
+                            // drained self.pending, so a conflict on
+                            // generation numbers should be impossible.
+                            tracing::error!(
+                                "Failed to enqueue deletions, leaking objects.  This is a bug."
+                            );
+                            metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        }
+                    }
+                }
+                ListWriterQueueMessage::Flush(op) => {
+                    if self.pending.is_empty() {
+                        // Execute immediately
+                        debug!("Flush: No pending objects, flushing immediately");
+                        op.notify()
+                    } else {
+                        // Execute next time we flush
+                        debug!("Flush: adding to pending flush list for next deadline flush");
+                        self.pending_flushes.push(op);
+                    }
+                }
+                ListWriterQueueMessage::FlushExecute(op) => {
+                    debug!("FlushExecute: passing through to backend");
+                    // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
+                    if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
+                        info!("Can't flush, shutting down ({e})");
+                        // Caller will get error when their oneshot sender was dropped.
+                    }
+                }
+                ListWriterQueueMessage::Recover(op) => {
+                    if self.recovered {
+                        tracing::error!(
+                            "Deletion queue recovery called more than once.  This is a bug."
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
+                        continue;
+                    }
+
+                    if let Err(e) = self.recover(op.attached_tenants).await {
+                        // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
+                        // queue receiver has been dropped, or something is critically broken with
+                        // the local filesystem holding deletion lists.
+                        info!(
+                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        return;
+                    } else {
+                        self.recovered = true;
+                    }
+                }
+            }
+
+            if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
+                self.flush().await;
+            }
+        }
+        info!("Deletion queue shut down.");
+    }
+}
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -0,0 +1,414 @@
+//! The validator is responsible for validating DeletionLists for execution,
+//! based on whethe the generation in the DeletionList is still the latest
+//! generation for a tenant.
+//!
+//! The purpose of validation is to ensure split-brain safety in the cluster
+//! of pageservers: a deletion may only be executed if the tenant generation
+//! that originated it is still current.  See docs/rfcs/025-generation-numbers.md
+//! The purpose of accumulating lists before validating them is to reduce load
+//! on the control plane API by issuing fewer, larger requests.
+//!
+//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
+//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
+//! to decide when old
+//!
+//! Deletions are passed onward to the Deleter.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+
+use crate::config::PageServerConf;
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::control_plane_client::RetryForeverError;
+use crate::metrics;
+
+use super::deleter::DeleterMessage;
+use super::DeletionHeader;
+use super::DeletionList;
+use super::DeletionQueueError;
+use super::FlushOp;
+use super::VisibleLsnUpdates;
+
+// After this length of time, do any validation work that is pending,
+// even if we haven't accumulated many keys to delete.
+//
+// This also causes updates to remote_consistent_lsn to be validated, even
+// if there were no deletions enqueued.
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+// If we have received this number of keys, proceed with attempting to execute
+const AUTOFLUSH_KEY_COUNT: usize = 16384;
+
+#[derive(Debug)]
+pub(super) enum ValidatorQueueMessage {
+    Delete(DeletionList),
+    Flush(FlushOp),
+}
+pub(super) struct Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    conf: &'static PageServerConf,
+    rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+    tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+
+    // Client for calling into control plane API for validation of deletes
+    control_plane_client: Option<C>,
+
+    // DeletionLists which are waiting generation validation.  Not safe to
+    // execute until [`validate`] has processed them.
+    pending_lists: Vec<DeletionList>,
+
+    // DeletionLists which have passed validation and are ready to execute.
+    validated_lists: Vec<DeletionList>,
+
+    // Sum of all the lengths of lists in pending_lists
+    pending_key_count: usize,
+
+    // Lsn validation state: we read projected LSNs and write back visible LSNs
+    // after validation.  This is the LSN equivalent of `pending_validation_lists`:
+    // it is drained in [`validate`]
+    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+
+    // If we failed to rewrite a deletion list due to local filesystem I/O failure,
+    // we must remember that and refuse to advance our persistent validated sequence
+    // number past the failure.
+    list_write_failed: Option<u64>,
+
+    cancel: CancellationToken,
+}
+
+impl<C> Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+        control_plane_client: Option<C>,
+        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            conf,
+            rx,
+            tx,
+            control_plane_client,
+            lsn_table,
+            pending_lists: Vec::new(),
+            validated_lists: Vec::new(),
+            pending_key_count: 0,
+            list_write_failed: None,
+            cancel,
+        }
+    }
+    /// Process any outstanding validations of generations of pending LSN updates or pending
+    /// DeletionLists.
+    ///
+    /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
+    /// go into the queue of ready-to-execute lists.
+    async fn validate(&mut self) -> Result<(), DeletionQueueError> {
+        let mut tenant_generations = HashMap::new();
+        for list in &self.pending_lists {
+            for (tenant_id, tenant_list) in &list.tenants {
+                // Note: DeletionLists are in logical time order, so generation always
+                // goes up.  By doing a simple insert() we will always end up with
+                // the latest generation seen for a tenant.
+                tenant_generations.insert(*tenant_id, tenant_list.generation);
+            }
+        }
+
+        let pending_lsn_updates = {
+            let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
+            std::mem::take(&mut *lsn_table)
+        };
+        for (tenant_id, update) in &pending_lsn_updates.tenants {
+            let entry = tenant_generations
+                .entry(*tenant_id)
+                .or_insert(update.generation);
+            if update.generation > *entry {
+                *entry = update.generation;
+            }
+        }
+
+        if tenant_generations.is_empty() {
+            // No work to do
+            return Ok(());
+        }
+
+        let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
+            match control_plane_client
+                .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
+                .await
+            {
+                Ok(tenants) => tenants,
+                Err(RetryForeverError::ShuttingDown) => {
+                    // The only way a validation call returns an error is when the cancellation token fires
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+            }
+        } else {
+            // Control plane API disabled.  In legacy mode we consider everything valid.
+            tenant_generations.keys().map(|k| (*k, true)).collect()
+        };
+
+        let mut validated_sequence: Option<u64> = None;
+
+        // Apply the validation results to the pending LSN updates
+        for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
+            let validated_generation = tenant_generations
+                .get(&tenant_id)
+                .expect("Map was built from the same keys we're reading");
+
+            let valid = tenants_valid
+                .get(&tenant_id)
+                .copied()
+                // If the tenant was missing from the validation response, it has been deleted.
+                // The Timeline that requested the LSN update is probably already torn down,
+                // or will be torn down soon.  In this case, drop the update by setting valid=false.
+                .unwrap_or(false);
+
+            if valid && *validated_generation == tenant_lsn_state.generation {
+                for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                    pending_lsn.result_slot.store(pending_lsn.projected);
+                }
+            } else {
+                // If we failed validation, then do not apply any of the projected updates
+                warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
+            }
+        }
+
+        // Apply the validation results to the pending deletion lists
+        for list in &mut self.pending_lists {
+            // Filter the list based on whether the server responded valid: true.
+            // If a tenant is omitted in the response, it has been deleted, and we should
+            // proceed with deletion.
+            let mut mutated = false;
+            list.tenants.retain(|tenant_id, tenant| {
+                let validated_generation = tenant_generations
+                    .get(tenant_id)
+                    .expect("Map was built from the same keys we're reading");
+
+                // If the tenant was missing from the validation response, it has been deleted.
+                // This means that a deletion is valid, but also redundant since the tenant's
+                // objects should have already been deleted.  Treat it as invalid to drop the
+                // redundant deletion.
+                let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
+
+                // A list is valid if it comes from the current _or previous_ generation.
+                // - The previous generation case is permitted due to how we store deletion lists locally:
+                // if we see the immediately previous generation in a locally stored deletion list,
+                // it proves that this node's disk was used for both current & previous generations,
+                // and therefore no other node was involved in between: the two generations may be
+                // logically treated as the same.
+                // - In that previous generation case, we rewrote it to the current generation
+                // in recover(), so the comparison here is simply an equality.
+
+                let this_list_valid = valid
+                    && (tenant.generation == *validated_generation);
+
+                if !this_list_valid {
+                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
+                    mutated = true;
+                }
+                this_list_valid
+            });
+            list.validated = true;
+
+            if mutated {
+                // Save the deletion list if we had to make changes due to stale generations.  The
+                // saved list is valid for execution.
+                if let Err(e) = list.save(self.conf).await {
+                    // Highly unexpected.  Could happen if e.g. disk full.
+                    // If we didn't save the trimmed list, it is _not_ valid to execute.
+                    warn!("Failed to save modified deletion list {list}: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+
+                    // Rather than have a complex retry process, just drop it and leak the objects,
+                    // scrubber will clean up eventually.
+                    list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
+
+                    // We must remember this failure, to prevent later writing out a header that
+                    // would imply the unwritable list was valid on disk.
+                    if self.list_write_failed.is_none() {
+                        self.list_write_failed = Some(list.sequence);
+                    }
+                }
+            }
+
+            validated_sequence = Some(list.sequence);
+        }
+
+        if let Some(validated_sequence) = validated_sequence {
+            if let Some(list_write_failed) = self.list_write_failed {
+                // Rare error case: we failed to write out a deletion list to excise invalid
+                // entries, so we cannot advance the header's valid sequence number past that point.
+                //
+                // In this state we will continue to validate, execute and delete deletion lists,
+                // we just cannot update the header.  It should be noticed and fixed by a human due to
+                // the nonzero value of our unexpected_errors metric.
+                warn!(
+                    sequence_number = list_write_failed,
+                    "Cannot write header because writing a deletion list failed earlier",
+                );
+            } else {
+                // Write the queue header to record how far validation progressed.  This avoids having
+                // to rewrite each DeletionList to set validated=true in it.
+                let header = DeletionHeader::new(validated_sequence);
+
+                // Drop result because the validated_sequence is an optimization.  If we fail to save it,
+                // then restart, we will drop some deletion lists, creating work for scrubber.
+                // The save() function logs a warning on error.
+                if let Err(e) = header.save(self.conf).await {
+                    warn!("Failed to write deletion queue header: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                }
+            }
+        }
+
+        // Transfer the validated lists to the validated queue, for eventual execution
+        self.validated_lists.append(&mut self.pending_lists);
+
+        Ok(())
+    }
+
+    async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
+        for list_path in list_paths {
+            debug!("Removing deletion list {}", list_path.display());
+
+            if let Err(e) = tokio::fs::remove_file(&list_path).await {
+                // Unexpected: we should have permissions and nothing else should
+                // be touching these files.  We will leave the file behind.  Subsequent
+                // pageservers will try and load it again: hopefully whatever storage
+                // issue (probably permissions) has been fixed by then.
+                tracing::error!("Failed to delete {}: {e:#}", list_path.display());
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                break;
+            }
+        }
+    }
+
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
+
+        // Issue any required generation validation calls to the control plane
+        self.validate().await?;
+
+        // After successful validation, nothing is pending: any lists that
+        // made it through validation will be in validated_lists.
+        assert!(self.pending_lists.is_empty());
+        self.pending_key_count = 0;
+
+        tracing::debug!(
+            "Validation complete, have {} validated lists",
+            self.validated_lists.len()
+        );
+
+        // Return quickly if we have no validated lists to execute.  This avoids flushing the
+        // executor when an idle backend hits its autoflush interval
+        if self.validated_lists.is_empty() {
+            return Ok(());
+        }
+
+        // Drain `validated_lists` into the executor
+        let mut executing_lists = Vec::new();
+        for list in self.validated_lists.drain(..) {
+            let list_path = self.conf.deletion_list_path(list.sequence);
+            let objects = list.into_remote_paths();
+            self.tx
+                .send(DeleterMessage::Delete(objects))
+                .await
+                .map_err(|_| DeletionQueueError::ShuttingDown)?;
+            executing_lists.push(list_path);
+        }
+
+        self.flush_executor().await?;
+
+        // Erase the deletion lists whose keys have all be deleted from remote storage
+        self.cleanup_lists(executing_lists).await;
+
+        Ok(())
+    }
+
+    async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
+        // Flush the executor, so that all the keys referenced by these deletion lists
+        // are actually removed from remote storage.  This is a precondition to deleting
+        // the deletion lists themselves.
+        let (flush_op, rx) = FlushOp::new();
+        self.tx
+            .send(DeleterMessage::Flush(flush_op))
+            .await
+            .map_err(|_| DeletionQueueError::ShuttingDown)?;
+
+        rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+
+    pub(super) async fn background(&mut self) {
+        tracing::info!("Started deletion backend worker");
+
+        while !self.cancel.is_cancelled() {
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    break;
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
+                    // return immediately if no work is pending.
+                    match self.flush().await {
+                        Ok(()) => {}
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we are shutting down, then auto-flush can safely be skipped
+                        }
+                    }
+
+                    continue;
+                }
+            };
+
+            match msg {
+                ValidatorQueueMessage::Delete(list) => {
+                    if list.validated {
+                        // A pre-validated list may only be seen during recovery, if we are recovering
+                        // a DeletionList whose on-disk state has validated=true
+                        self.validated_lists.push(list)
+                    } else {
+                        self.pending_key_count += list.len();
+                        self.pending_lists.push(list);
+                    }
+
+                    if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
+                        match self.flush().await {
+                            Ok(()) => {}
+                            Err(DeletionQueueError::ShuttingDown) => {
+                                // If we are shutting down, then auto-flush can safely be skipped
+                            }
+                        }
+                    }
+                }
+                ValidatorQueueMessage::Flush(op) => {
+                    match self.flush().await {
+                        Ok(()) => {
+                            op.notify();
+                        }
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we fail due to shutting down, we will just drop `op` to propagate that status.
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1093,6 +1093,9 @@ components:
        remote_consistent_lsn:
          type: string
          format: hex
+        remote_consistent_lsn_visible:
+          type: string
+          format: hex
        ancestor_timeline_id:
          type: string
          format: hex
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -5,6 +5,7 @@ use std::collections::HashMap;
 use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
+use futures::TryFutureExt;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -24,6 +25,7 @@ use super::models::{
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -34,7 +36,7 @@ use crate::tenant::mgr::{
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
@@ -61,6 +63,7 @@ pub struct State {
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+    deletion_queue_client: DeletionQueueClient,
 }

 impl State {
@@ -70,6 +73,7 @@ impl State {
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
@@ -82,8 +86,17 @@ impl State {
            remote_storage,
            broker_client,
            disk_usage_eviction_state,
+            deletion_queue_client,
        })
    }
+
+    fn tenant_resources(&self) -> TenantSharedResources {
+        TenantSharedResources {
+            broker_client: self.broker_client.clone(),
+            remote_storage: self.remote_storage.clone(),
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
+    }
 }

 #[inline(always)]
@@ -283,7 +296,12 @@ async fn build_timeline_info_common(
    };
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
-    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+    let remote_consistent_lsn_projected = timeline
+        .get_remote_consistent_lsn_projected()
+        .unwrap_or(Lsn(0));
+    let remote_consistent_lsn_visible = timeline
+        .get_remote_consistent_lsn_visible()
+        .unwrap_or(Lsn(0));

    let walreceiver_status = timeline.walreceiver_status();

@@ -293,7 +311,8 @@ async fn build_timeline_info_common(
        ancestor_timeline_id,
        ancestor_lsn,
        disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
-        remote_consistent_lsn,
+        remote_consistent_lsn: remote_consistent_lsn_projected,
+        remote_consistent_lsn_visible,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -492,24 +511,23 @@ async fn tenant_attach_handler(

    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;

-    if let Some(remote_storage) = &state.remote_storage {
-        mgr::attach_tenant(
-            state.conf,
-            tenant_id,
-            generation,
-            tenant_conf,
-            state.broker_client.clone(),
-            remote_storage.clone(),
-            &ctx,
-        )
-        .instrument(info_span!("tenant_attach", %tenant_id))
-        .await?;
-    } else {
+    if state.remote_storage.is_none() {
        return Err(ApiError::BadRequest(anyhow!(
            "attach_tenant is not possible because pageserver was configured without remote storage"
        )));
    }

+    mgr::attach_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        tenant_conf,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_attach", %tenant_id))
+    .await?;
+
    json_response(StatusCode::ACCEPTED, ())
 }

@@ -570,6 +588,7 @@ async fn tenant_load_handler(
        generation,
        state.broker_client.clone(),
        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
        &ctx,
    )
    .instrument(info_span!("load", %tenant_id))
@@ -911,8 +930,7 @@ async fn tenant_create_handler(
        tenant_conf,
        target_tenant_id,
        generation,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
+        state.tenant_resources(),
        &ctx,
    )
    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get(
    json_response(StatusCode::OK, info)
 }

+async fn deletion_queue_flush(
+    r: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&r);
+
+    if state.remote_storage.is_none() {
+        // Nothing to do if remote storage is disabled.
+        return json_response(StatusCode::OK, ());
+    }
+
+    let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
+
+    let flush = async {
+        if execute {
+            state.deletion_queue_client.flush_execute().await
+        } else {
+            state.deletion_queue_client.flush().await
+        }
+    }
+    // DeletionQueueError's only case is shutting down.
+    .map_err(|_| ApiError::ShuttingDown);
+
+    tokio::select! {
+        res = flush => {
+            res.map(|()| json_response(StatusCode::OK, ()))?
+        }
+        _ = cancel.cancelled() => {
+            Err(ApiError::ShuttingDown)
+        }
+    }
+}
+
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -1463,6 +1514,9 @@ pub fn make_router(
        .put("/v1/disk_usage_eviction/run", |r| {
            api_handler(r, disk_usage_eviction_run)
        })
+        .put("/v1/deletion_queue/flush", |r| {
+            api_handler(r, deletion_queue_flush)
+        })
        .put("/v1/tenant/:tenant_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
            {
                pg_control = Some(control_file);
            }
-            modification.flush().await?;
+            modification.flush(ctx).await?;
        }
    }

    // We're done importing all the data files.
-    modification.commit().await?;
+    modification.commit(ctx).await?;

    // We expect the Postgres server to be shut down cleanly.
    let pg_control = pg_control.context("pg_control file not found")?;
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
                    // We found the pg_control file.
                    pg_control = Some(res);
                }
-                modification.flush().await?;
+                modification.flush(ctx).await?;
            }
            tokio_tar::EntryType::Directory => {
                debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
    // sanity check: ensure that pg_control is loaded
    let _pg_control = pg_control.context("pg_control file not found")?;

-    modification.commit().await?;
+    modification.commit(ctx).await?;
    Ok(())
 }

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,7 +3,8 @@ pub mod basebackup;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
-mod control_plane_client;
+pub mod control_plane_client;
+pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
@@ -27,6 +28,7 @@ pub mod failpoint_support;
 use std::path::Path;

 use crate::task_mgr::TaskKind;
+use deletion_queue::DeletionQueue;
 use tracing::info;

 /// Current storage format version
@@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

-#[tracing::instrument]
-pub async fn shutdown_pageserver(exit_code: i32) {
+#[tracing::instrument(skip_all, fields(%exit_code))]
+pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
@@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) {
    )
    .await;

+    // Best effort to persist any outstanding deletions, to avoid leaking objects
+    if let Some(mut deletion_queue) = deletion_queue {
+        deletion_queue.shutdown(Duration::from_secs(5)).await;
+    }
+
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,3 +1,4 @@
+use enum_map::EnumMap;
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
@@ -127,22 +128,24 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub struct PageCacheMetrics {
+pub struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
-    pub read_accesses_ephemeral: IntCounter,
    pub read_accesses_immutable: IntCounter,

-    pub read_hits_ephemeral: IntCounter,
    pub read_hits_immutable: IntCounter,
    pub read_hits_materialized_page_exact: IntCounter,
    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

+pub struct PageCacheMetrics {
+    map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
+}
+
 static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_page_cache_read_hits_total",
        "Number of read accesses to the page cache that hit",
-        &["key_kind", "hit_kind"]
+        &["task_kind", "key_kind", "content_kind", "hit_kind"]
    )
    .expect("failed to define a metric")
 });
@@ -151,55 +154,73 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_page_cache_read_accesses_total",
        "Number of read accesses to the page cache",
-        &["key_kind"]
+        &["task_kind", "key_kind", "content_kind"]
    )
    .expect("failed to define a metric")
 });

 pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
-    read_accesses_materialized_page: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
+    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
+        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
+        let task_kind: &'static str = task_kind.into();
+        EnumMap::from_array(std::array::from_fn(|content_kind| {
+            let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
+            let content_kind: &'static str = content_kind.into();
+            PageCacheMetricsForTaskKind {
+                read_accesses_materialized_page: {
+                    PAGE_CACHE_READ_ACCESSES
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                        ])
+                        .unwrap()
+                },

-    read_accesses_ephemeral: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
+                read_accesses_immutable: {
+                    PAGE_CACHE_READ_ACCESSES
+                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
+                        .unwrap()
+                },

-    read_accesses_immutable: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
+                read_hits_immutable: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
+                        .unwrap()
+                },

-    read_hits_ephemeral: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["ephemeral", "-"])
-            .unwrap()
-    },
+                read_hits_materialized_page_exact: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "exact",
+                        ])
+                        .unwrap()
+                },

-    read_hits_immutable: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["immutable", "-"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_exact: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "exact"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_older_lsn: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "older_lsn"])
-            .unwrap()
-    },
+                read_hits_materialized_page_older_lsn: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "older_lsn",
+                        ])
+                        .unwrap()
+                },
+            }
+        }))
+    })),
 });

+impl PageCacheMetrics {
+    pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
+        &self.map[ctx.task_kind()][ctx.page_content_kind()]
+    }
+}
+
 pub struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

@@ -270,6 +291,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_resident_physical_size_global",
+        "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_remote_physical_size",
@@ -280,6 +309,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_remote_physical_size_global",
+        "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
@@ -866,6 +903,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
    .expect("failed to define a metric")
 });

+pub(crate) struct DeletionQueueMetrics {
+    pub(crate) keys_submitted: IntCounter,
+    pub(crate) keys_dropped: IntCounter,
+    pub(crate) keys_executed: IntCounter,
+    pub(crate) dropped_lsn_updates: IntCounter,
+    pub(crate) unexpected_errors: IntCounter,
+    pub(crate) remote_errors: IntCounterVec,
+}
+pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
+    DeletionQueueMetrics{
+
+    keys_submitted: register_int_counter!(
+        "pageserver_deletion_queue_submitted_total",
+        "Number of objects submitted for deletion"
+    )
+    .expect("failed to define a metric"),
+
+    keys_dropped: register_int_counter!(
+        "pageserver_deletion_queue_dropped_total",
+        "Number of object deletions dropped due to stale generation."
+    )
+    .expect("failed to define a metric"),
+
+    keys_executed: register_int_counter!(
+        "pageserver_deletion_queue_executed_total",
+        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
+    )
+    .expect("failed to define a metric"),
+
+    dropped_lsn_updates: register_int_counter!(
+        "pageserver_deletion_queue_dropped_lsn_updates_total",
+        "Updates to remote_consistent_lsn dropped due to stale generation number."
+    )
+    .expect("failed to define a metric"),
+    unexpected_errors: register_int_counter!(
+        "pageserver_deletion_queue_unexpected_errors_total",
+        "Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
+    )
+    .expect("failed to define a metric"),
+    remote_errors: register_int_counter_vec!(
+        "pageserver_deletion_queue_remote_errors_total",
+        "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
+        &["op_kind"],
+    )
+    .expect("failed to define a metric")
+}
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -1140,7 +1225,7 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    pub resident_physical_size_gauge: UIntGauge,
+    resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -1218,10 +1303,29 @@ impl TimelineMetrics {
    }

    pub fn record_new_file_metrics(&self, sz: u64) {
-        self.resident_physical_size_gauge.add(sz);
+        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }
+
+    pub fn resident_physical_size_sub(&self, sz: u64) {
+        self.resident_physical_size_gauge.sub(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
+    }
+
+    pub fn resident_physical_size_add(&self, sz: u64) {
+        self.resident_physical_size_gauge.add(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
+    }
+
+    pub fn resident_physical_size_set(&self, sz: u64) {
+        self.resident_physical_size_gauge.set(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
+    }
+
+    pub fn resident_physical_size_get(&self) -> u64 {
+        self.resident_physical_size_gauge.get()
+    }
 }

 impl Drop for TimelineMetrics {
@@ -1229,7 +1333,10 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        {
+            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        }
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1280,10 +1387,46 @@ use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};

+use crate::context::{PageContentKind, RequestContext};
+use crate::task_mgr::TaskKind;
+
+/// Maintain a per timeline gauge in addition to the global gauge.
+struct PerTimelineRemotePhysicalSizeGauge {
+    last_set: u64,
+    gauge: UIntGauge,
+}
+
+impl PerTimelineRemotePhysicalSizeGauge {
+    fn new(per_timeline_gauge: UIntGauge) -> Self {
+        Self {
+            last_set: per_timeline_gauge.get(),
+            gauge: per_timeline_gauge,
+        }
+    }
+    fn set(&mut self, sz: u64) {
+        self.gauge.set(sz);
+        if sz < self.last_set {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
+        } else {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
+        };
+        self.last_set = sz;
+    }
+    fn get(&self) -> u64 {
+        self.gauge.get()
+    }
+}
+
+impl Drop for PerTimelineRemotePhysicalSizeGauge {
+    fn drop(&mut self) {
+        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
+    }
+}
+
 pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
-    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -1301,18 +1444,24 @@ impl RemoteTimelineClientMetrics {
        }
    }

-    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
+    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
-        guard
-            .get_or_insert_with(|| {
+        let gauge = guard.get_or_insert_with(|| {
+            PerTimelineRemotePhysicalSizeGauge::new(
                REMOTE_PHYSICAL_SIZE
                    .get_metric_with_label_values(&[
                        &self.tenant_id.to_string(),
                        &self.timeline_id.to_string(),
                    ])
-                    .unwrap()
-            })
-            .clone()
+                    .unwrap(),
+            )
+        });
+        gauge.set(sz);
+    }
+
+    pub(crate) fn remote_physical_size_get(&self) -> u64 {
+        let guard = self.remote_physical_size_gauge.lock().unwrap();
+        guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
    }

    pub fn remote_operation_time(
@@ -1651,6 +1800,9 @@ pub fn preinitialize_metrics() {
        Lazy::force(c);
    });

+    // Deletion queue stats
+    Lazy::force(&DELETION_QUEUE);
+
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -85,7 +85,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{metrics::PageCacheSizeMetrics, repository::Key};
+use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -346,8 +346,10 @@ impl PageCache {
        timeline_id: TimelineId,
        key: &Key,
        lsn: Lsn,
+        ctx: &RequestContext,
    ) -> Option<(Lsn, PageReadGuard)> {
        crate::metrics::PAGE_CACHE
+            .for_ctx(ctx)
            .read_accesses_materialized_page
            .inc();

@@ -368,10 +370,12 @@ impl PageCache {
            {
                if available_lsn == lsn {
                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
                        .read_hits_materialized_page_exact
                        .inc();
                } else {
                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
                        .read_hits_materialized_page_older_lsn
                        .inc();
                }
@@ -426,10 +430,11 @@ impl PageCache {
        &self,
        file_id: FileId,
        blkno: u32,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

-        self.lock_for_read(&mut cache_key).await
+        self.lock_for_read(&mut cache_key, ctx).await
    }

    //
@@ -497,14 +502,20 @@ impl PageCache {
    /// }
    /// ```
    ///
-    async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
+    async fn lock_for_read(
+        &self,
+        cache_key: &mut CacheKey,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ReadBufResult> {
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
            CacheKey::ImmutableFilePage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
-                &crate::metrics::PAGE_CACHE.read_hits_immutable,
+                &crate::metrics::PAGE_CACHE
+                    .for_ctx(ctx)
+                    .read_accesses_immutable,
+                &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
            ),
        };
        read_access.inc();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1138,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub async fn flush(&mut self) -> anyhow::Result<()> {
+    pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1154,7 +1154,7 @@ impl<'a> DatadirModification<'a> {
            if is_rel_block_key(key) || is_slru_block_key(key) {
                // This bails out on first error without modifying pending_updates.
                // That's Ok, cf this function's doc comment.
-                writer.put(key, self.lsn, &value).await?;
+                writer.put(key, self.lsn, &value, ctx).await?;
            } else {
                retained_pending_updates.insert(key, value);
            }
@@ -1174,14 +1174,14 @@ impl<'a> DatadirModification<'a> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub async fn commit(&mut self) -> anyhow::Result<()> {
+    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let writer = self.tline.writer().await;
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value).await?;
+            writer.put(key, lsn, &value, ctx).await?;
        }
        for key_range in self.pending_deletions.drain(..) {
            writer.delete(key_range, lsn).await?;
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,7 @@ impl Key {
            | self.field6 as i128
    }

-    pub fn from_i128(x: i128) -> Self {
+    pub const fn from_i128(x: i128) -> Self {
        Key {
            field1: ((x >> 120) & 0xf) as u8,
            field2: ((x >> 104) & 0xFFFF) as u32,
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -187,6 +187,7 @@ task_local! {
    Debug,
    // NB: enumset::EnumSetType derives PartialEq, Eq, Clone, Copy
    enumset::EnumSetType,
+    enum_map::Enum,
    serde::Serialize,
    serde::Deserialize,
    strum_macros::IntoStaticStr,
@@ -455,7 +456,7 @@ async fn task_finish(
    }

    if shutdown_process {
-        shutdown_pageserver(1).await;
+        shutdown_pageserver(None, 1).await;
    }
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT_ACTIVATION;
@@ -117,7 +118,7 @@ mod span;

 pub mod metadata;
 mod par_fsync;
-mod remote_timeline_client;
+pub mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
@@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
 pub struct TenantSharedResources {
    pub broker_client: storage_broker::BrokerClientChannel,
    pub remote_storage: Option<GenericRemoteStorage>,
+    pub deletion_queue_client: DeletionQueueClient,
 }

 ///
@@ -197,6 +199,9 @@ pub struct Tenant {
    // provides access to timeline data sitting in the remote storage
    pub(crate) remote_storage: Option<GenericRemoteStorage>,

+    // Access to global deletion queue for when this tenant wants to schedule a deletion
+    deletion_queue_client: DeletionQueueClient,
+
    /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -523,15 +528,20 @@ impl Tenant {
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        generation: Generation,
-        broker_client: storage_broker::BrokerClientChannel,
+        resources: TenantSharedResources,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        remote_storage: GenericRemoteStorage,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
        // TODO dedup with spawn_load
        let tenant_conf =
            Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;

+        let TenantSharedResources {
+            broker_client,
+            remote_storage,
+            deletion_queue_client,
+        } = resources;
+
        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Arc::new(Tenant::new(
            TenantState::Attaching,
@@ -540,7 +550,8 @@ impl Tenant {
            wal_redo_manager,
            tenant_id,
            generation,
-            Some(remote_storage.clone()),
+            remote_storage.clone(),
+            deletion_queue_client,
        ));

        // Do all the hard work in the background
@@ -571,7 +582,7 @@ impl Tenant {
                let pending_deletion = {
                    match DeleteTenantFlow::should_resume_deletion(
                        conf,
-                        Some(&remote_storage),
+                        remote_storage.as_ref(),
                        &tenant_clone,
                    )
                    .await
@@ -660,6 +671,7 @@ impl Tenant {
        for timeline_id in remote_timeline_ids {
            let client = RemoteTimelineClient::new(
                remote_storage.clone(),
+                self.deletion_queue_client.clone(),
                self.conf,
                self.tenant_id,
                timeline_id,
@@ -726,6 +738,7 @@ impl Tenant {
                remote_metadata,
                TimelineResources {
                    remote_client: Some(remote_client),
+                    deletion_queue_client: self.deletion_queue_client.clone(),
                },
                ctx,
            )
@@ -750,6 +763,7 @@ impl Tenant {
                timeline_id,
                &index_part.metadata,
                Some(remote_timeline_client),
+                self.deletion_queue_client.clone(),
                None,
            )
            .await
@@ -851,6 +865,7 @@ impl Tenant {
            tenant_id,
            Generation::broken(),
            None,
+            DeletionQueueClient::broken(),
        ))
    }

@@ -895,6 +910,7 @@ impl Tenant {
            tenant_id,
            generation,
            remote_storage.clone(),
+            resources.deletion_queue_client.clone(),
        );
        let tenant = Arc::new(tenant);

@@ -1302,6 +1318,7 @@ impl Tenant {
                                timeline_id,
                                &local_metadata,
                                Some(remote_client),
+                                self.deletion_queue_client.clone(),
                                init_order,
                            )
                            .await
@@ -1351,6 +1368,7 @@ impl Tenant {
                        timeline_id,
                        &local_metadata,
                        None,
+                        self.deletion_queue_client.clone(),
                        init_order,
                    )
                    .await
@@ -1504,7 +1522,7 @@ impl Tenant {
            .init_empty_test_timeline()
            .context("init_empty_test_timeline")?;
        modification
-            .commit()
+            .commit(ctx)
            .await
            .context("commit init_empty_test_timeline modification")?;

@@ -2242,6 +2260,9 @@ impl Tenant {
        Ok(timeline)
    }

+    // Allow too_many_arguments because a constructor's argument list naturally grows with the
+    // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
+    #[allow(clippy::too_many_arguments)]
    fn new(
        state: TenantState,
        conf: &'static PageServerConf,
@@ -2250,6 +2271,7 @@ impl Tenant {
        tenant_id: TenantId,
        generation: Generation,
        remote_storage: Option<GenericRemoteStorage>,
+        deletion_queue_client: DeletionQueueClient,
    ) -> Tenant {
        let (state, mut rx) = watch::channel(state);

@@ -2317,6 +2339,7 @@ impl Tenant {
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
            remote_storage,
+            deletion_queue_client,
            state,
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2856,6 +2879,7 @@ impl Tenant {
        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
            let remote_client = RemoteTimelineClient::new(
                remote_storage.clone(),
+                self.deletion_queue_client.clone(),
                self.conf,
                self.tenant_id,
                timeline_id,
@@ -2866,7 +2890,10 @@ impl Tenant {
            None
        };

-        TimelineResources { remote_client }
+        TimelineResources {
+            remote_client,
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
    }

    /// Creates intermediate timeline structure and its files.
@@ -3322,6 +3349,7 @@ pub mod harness {
    use utils::logging;
    use utils::lsn::Lsn;

+    use crate::deletion_queue::mock::MockDeletionQueue;
    use crate::{
        config::PageServerConf,
        repository::Key,
@@ -3383,6 +3411,7 @@ pub mod harness {
        pub generation: Generation,
        pub remote_storage: GenericRemoteStorage,
        pub remote_fs_dir: PathBuf,
+        pub deletion_queue: MockDeletionQueue,
    }

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3431,6 +3460,7 @@ pub mod harness {
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
+            let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));

            Ok(Self {
                conf,
@@ -3439,6 +3469,7 @@ pub mod harness {
                generation: Generation::new(0xdeadbeef),
                remote_storage,
                remote_fs_dir,
+                deletion_queue,
            })
        }

@@ -3463,6 +3494,7 @@ pub mod harness {
                self.tenant_id,
                self.generation,
                Some(self.remote_storage.clone()),
+                self.deletion_queue.new_client(),
            ));
            tenant
                .load(None, ctx)
@@ -3538,14 +3570,24 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x10),
+                &Value::Image(TEST_IMG("foo at 0x10")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x10));
        drop(writer);

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x20),
+                &Value::Image(TEST_IMG("foo at 0x20")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x20));
        drop(writer);
@@ -3619,19 +3661,19 @@ mod tests {

        // Insert a value on the timeline
        writer
-            .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))
+            .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx)
            .await?;
        writer
-            .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))
+            .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x20));

        writer
-            .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))
+            .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x30));
        writer
-            .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))
+            .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x40));

@@ -3646,7 +3688,7 @@ mod tests {
            .expect("Should have a local timeline");
        let new_writer = newtline.writer().await;
        new_writer
-            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))
+            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
            .await?;
        new_writer.finish_write(Lsn(0x40));

@@ -3669,7 +3711,11 @@ mod tests {
        Ok(())
    }

-    async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
+    async fn make_some_layers(
+        tline: &Timeline,
+        start_lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        #[allow(non_snake_case)]
        {
@@ -3680,6 +3726,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3689,6 +3736,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3702,6 +3750,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3711,6 +3760,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3727,7 +3777,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        // FIXME: this doesn't actually remove any layer currently, given how the flushing
@@ -3801,7 +3851,7 @@ mod tests {
            .load();

        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -3823,7 +3873,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3832,7 +3882,7 @@ mod tests {
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

-        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+        make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

        tline.set_broken("test".to_owned());

@@ -3873,7 +3923,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3898,7 +3948,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3907,7 +3957,7 @@ mod tests {
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

-        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+        make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

        // run gc on parent
        tenant
@@ -3932,7 +3982,7 @@ mod tests {
            let tline = tenant
                .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
                .await?;
-            make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
+            make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), true)
@@ -3961,7 +4011,7 @@ mod tests {
                .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
                .await?;

-            make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+            make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

            let child_tline = tenant
                .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3972,7 +4022,7 @@ mod tests {
                .get_timeline(NEW_TIMELINE_ID, true)
                .expect("Should have a local timeline");

-            make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+            make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

            // so that all uploads finish & we can call harness.load() below again
            tenant
@@ -4004,7 +4054,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        let layer_map = tline.layers.read().await;
        let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
@@ -4087,7 +4137,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x10),
+                &Value::Image(TEST_IMG("foo at 0x10")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x10));
        drop(writer);
@@ -4097,7 +4152,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x20),
+                &Value::Image(TEST_IMG("foo at 0x20")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x20));
        drop(writer);
@@ -4107,7 +4167,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x30),
+                &Value::Image(TEST_IMG("foo at 0x30")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x30));
        drop(writer);
@@ -4117,7 +4182,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x40),
+                &Value::Image(TEST_IMG("foo at 0x40")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x40));
        drop(writer);
@@ -4155,7 +4225,8 @@ mod tests {
    //
    #[tokio::test]
    async fn test_bulk_insert() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
+        let harness = TenantHarness::create("test_bulk_insert")?;
+        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4175,6 +4246,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                writer.finish_write(lsn);
@@ -4201,7 +4273,8 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
+        let harness = TenantHarness::create("test_random_updates")?;
+        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4227,6 +4300,7 @@ mod tests {
                    test_key,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -4247,6 +4321,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                writer.finish_write(lsn);
@@ -4306,6 +4381,7 @@ mod tests {
                    test_key,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -4334,6 +4410,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                println!("updating {} at {}", blknum, lsn);
@@ -4402,6 +4479,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                println!("updating [{}][{}] at {}", idx, blknum, lsn);
@@ -4474,7 +4552,7 @@ mod tests {
            .init_empty_test_timeline()
            .context("init_empty_test_timeline")?;
        modification
-            .commit()
+            .commit(&ctx)
            .await
            .context("commit init_empty_test_timeline modification")?;

--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -11,6 +11,7 @@
 //! len <  128: 0XXXXXXX
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
+use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
 use crate::virtual_file::VirtualFile;
@@ -19,9 +20,13 @@ use std::io::{Error, ErrorKind};

 impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
-    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub async fn read_blob(
+        &self,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
-        self.read_blob_into_buf(offset, &mut buf).await?;
+        self.read_blob_into_buf(offset, &mut buf, ctx).await?;
        Ok(buf)
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
@@ -30,11 +35,12 @@ impl<'a> BlockCursor<'a> {
        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
+        ctx: &RequestContext,
    ) -> Result<(), std::io::Error> {
        let mut blknum = (offset / PAGE_SZ as u64) as u32;
        let mut off = (offset % PAGE_SZ as u64) as usize;

-        let mut buf = self.read_blk(blknum).await?;
+        let mut buf = self.read_blk(blknum, ctx).await?;

        // peek at the first byte, to determine if it's a 1- or 4-byte length
        let first_len_byte = buf[off];
@@ -50,7 +56,7 @@ impl<'a> BlockCursor<'a> {
                // it is split across two pages
                len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum, ctx).await?;
                len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
                off = 4 - thislen;
            } else {
@@ -71,7 +77,7 @@ impl<'a> BlockCursor<'a> {
            if page_remain == 0 {
                // continue on next page
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum, ctx).await?;
                off = 0;
                page_remain = PAGE_SZ;
            }
@@ -228,12 +234,13 @@ impl BlobWriter<false> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::block_io::BlockReaderRef;
+    use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
        let temp_dir = tempfile::tempdir()?;
        let path = temp_dir.path().join("file");
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
@@ -255,7 +262,7 @@ mod tests {
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new(rdr);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
-            let blob_read = rdr.read_blob(*offset).await?;
+            let blob_read = rdr.read_blob(*offset, &ctx).await?;
            assert_eq!(
                blob, &blob_read,
                "mismatch for idx={idx} at offset={offset}"
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -4,6 +4,7 @@

 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
+use crate::context::RequestContext;
 use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
@@ -82,12 +83,16 @@ pub(crate) enum BlockReaderRef<'a> {

 impl<'a> BlockReaderRef<'a> {
    #[inline(always)]
-    async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
        use BlockReaderRef::*;
        match self {
-            FileBlockReader(r) => r.read_blk(blknum).await,
-            EphemeralFile(r) => r.read_blk(blknum).await,
-            Adapter(r) => r.read_blk(blknum).await,
+            FileBlockReader(r) => r.read_blk(blknum, ctx).await,
+            EphemeralFile(r) => r.read_blk(blknum, ctx).await,
+            Adapter(r) => r.read_blk(blknum, ctx).await,
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
            #[cfg(test)]
@@ -105,11 +110,13 @@ impl<'a> BlockReaderRef<'a> {
 ///
 /// ```no_run
 /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
+/// # use pageserver::context::RequestContext;
 /// # let reader: FileBlockReader = unimplemented!("stub");
+/// # let ctx: RequestContext = unimplemented!("stub");
 /// let cursor = reader.block_cursor();
-/// let buf = cursor.read_blk(1);
+/// let buf = cursor.read_blk(1, &ctx);
 /// // do stuff with 'buf'
-/// let buf = cursor.read_blk(2);
+/// let buf = cursor.read_blk(2, &ctx);
 /// // do stuff with 'buf'
 /// ```
 ///
@@ -134,8 +141,12 @@ impl<'a> BlockCursor<'a> {
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    #[inline(always)]
-    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.reader.read_blk(blknum).await
+    pub async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum, ctx).await
    }
 }

@@ -169,11 +180,15 @@ impl FileBlockReader {
    /// Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
-    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    pub async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
-                .read_immutable_buf(self.file_id, blknum)
+                .read_immutable_buf(self.file_id, blknum, ctx)
                .await
                .map_err(|e| {
                    std::io::Error::new(
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -26,7 +26,11 @@ use std::{cmp::Ordering, io, result};
 use thiserror::Error;
 use tracing::error;

-use crate::tenant::block_io::{BlockReader, BlockWriter};
+use crate::{
+    context::{DownloadBehavior, RequestContext},
+    task_mgr::TaskKind,
+    tenant::block_io::{BlockReader, BlockWriter},
+};

 // The maximum size of a value stored in the B-tree. 5 bytes is enough currently.
 pub const VALUE_SZ: usize = 5;
@@ -231,14 +235,19 @@ where
    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
-    pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
+    pub async fn get(&self, search_key: &[u8; L], ctx: &RequestContext) -> Result<Option<u64>> {
        let mut result: Option<u64> = None;
-        self.visit(search_key, VisitDirection::Forwards, |key, value| {
-            if key == search_key {
-                result = Some(value);
-            }
-            false
-        })
+        self.visit(
+            search_key,
+            VisitDirection::Forwards,
+            |key, value| {
+                if key == search_key {
+                    result = Some(value);
+                }
+                false
+            },
+            ctx,
+        )
        .await?;
        Ok(result)
    }
@@ -253,6 +262,7 @@ where
        search_key: &[u8; L],
        dir: VisitDirection,
        mut visitor: V,
+        ctx: &RequestContext,
    ) -> Result<bool>
    where
        V: FnMut(&[u8], u64) -> bool,
@@ -262,7 +272,9 @@ where
        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, opt_iter)) = stack.pop() {
            // Locate the node.
-            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
+            let node_buf = block_cursor
+                .read_blk(self.start_blk + node_blknum, ctx)
+                .await?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
@@ -351,13 +363,14 @@ where
    #[allow(dead_code)]
    pub async fn dump(&self) -> Result<()> {
        let mut stack = Vec::new();
+        let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

        stack.push((self.root_blk, String::new(), 0, 0, 0));

        let block_cursor = self.reader.block_cursor();

        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -688,6 +701,8 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 pub(crate) mod tests {
    use super::*;
+    use crate::context::DownloadBehavior;
+    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
    use rand::Rng;
    use std::collections::BTreeMap;
@@ -725,6 +740,8 @@ pub(crate) mod tests {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
        let all_keys: Vec<&[u8; 6]> = vec![
            b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
        ];
@@ -745,12 +762,12 @@ pub(crate) mod tests {

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
-            assert_eq!(reader.get(key).await?, Some(*val));
+            assert_eq!(reader.get(key, &ctx).await?, Some(*val));
        }
        // And on some keys that don't exist
-        assert_eq!(reader.get(b"aaaaaa").await?, None);
-        assert_eq!(reader.get(b"zzzzzz").await?, None);
-        assert_eq!(reader.get(b"xaaabx").await?, None);
+        assert_eq!(reader.get(b"aaaaaa", &ctx).await?, None);
+        assert_eq!(reader.get(b"zzzzzz", &ctx).await?, None);
+        assert_eq!(reader.get(b"xaaabx", &ctx).await?, None);

        // Test search with `visit` function
        let search_key = b"xabaaa";
@@ -762,10 +779,15 @@ pub(crate) mod tests {

        let mut data = Vec::new();
        reader
-            .visit(search_key, VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
+            .visit(
+                search_key,
+                VisitDirection::Forwards,
+                |key, value| {
+                    data.push((key.to_vec(), value));
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(data, expected);

@@ -778,18 +800,28 @@ pub(crate) mod tests {
        expected.reverse();
        let mut data = Vec::new();
        reader
-            .visit(search_key, VisitDirection::Backwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
+            .visit(
+                search_key,
+                VisitDirection::Backwards,
+                |key, value| {
+                    data.push((key.to_vec(), value));
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(data, expected);

        // Backward scan where nothing matches
        reader
-            .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
-                panic!("found unexpected key {}: {}", hex::encode(key), value);
-            })
+            .visit(
+                b"aaaaaa",
+                VisitDirection::Backwards,
+                |key, value| {
+                    panic!("found unexpected key {}: {}", hex::encode(key), value);
+                },
+                &ctx,
+            )
            .await?;

        // Full scan
@@ -799,10 +831,15 @@ pub(crate) mod tests {
            .collect();
        let mut data = Vec::new();
        reader
-            .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
+            .visit(
+                &[0u8; 6],
+                VisitDirection::Forwards,
+                |key, value| {
+                    data.push((key.to_vec(), value));
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(data, expected);

@@ -813,6 +850,7 @@ pub(crate) mod tests {
    async fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        const NUM_KEYS: u64 = 1000;

@@ -851,14 +889,14 @@ pub(crate) mod tests {
        for search_key_int in 0..(NUM_KEYS * 2 + 10) {
            let search_key = u64::to_be_bytes(search_key_int);
            assert_eq!(
-                reader.get(&search_key).await?,
+                reader.get(&search_key, &ctx).await?,
                all_data.get(&search_key_int).cloned()
            );

            // Test a forward scan starting with this key
            result.lock().unwrap().clear();
            reader
-                .visit(&search_key, VisitDirection::Forwards, take_ten)
+                .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
                .await?;
            let expected = all_data
                .range(search_key_int..)
@@ -870,7 +908,7 @@ pub(crate) mod tests {
            // And a backwards scan
            result.lock().unwrap().clear();
            reader
-                .visit(&search_key, VisitDirection::Backwards, take_ten)
+                .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
                .await?;
            let expected = all_data
                .range(..=search_key_int)
@@ -886,7 +924,7 @@ pub(crate) mod tests {
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
        reader
-            .visit(&search_key, VisitDirection::Forwards, take_ten)
+            .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
            .await?;
        let expected = all_data
            .iter()
@@ -899,7 +937,7 @@ pub(crate) mod tests {
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
        reader
-            .visit(&search_key, VisitDirection::Backwards, take_ten)
+            .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
            .await?;
        let expected = all_data
            .iter()
@@ -913,6 +951,8 @@ pub(crate) mod tests {

    #[tokio::test]
    async fn random_data() -> Result<()> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
        // Generate random keys with exponential distribution, to
        // exercise the prefix compression
        const NUM_KEYS: usize = 100000;
@@ -939,22 +979,24 @@ pub(crate) mod tests {
        // Test get() operation on all the keys
        for (&key, &val) in all_data.iter() {
            let search_key = u128::to_be_bytes(key);
-            assert_eq!(reader.get(&search_key).await?, Some(val));
+            assert_eq!(reader.get(&search_key, &ctx).await?, Some(val));
        }

        // Test get() operations on random keys, most of which will not exist
        for _ in 0..100000 {
            let key_int = rand::thread_rng().gen::<u128>();
            let search_key = u128::to_be_bytes(key_int);
-            assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
+            assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned());
        }

        // Test boundary cases
        assert!(
-            reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
+            reader.get(&u128::to_be_bytes(u128::MIN), &ctx).await?
+                == all_data.get(&u128::MIN).cloned()
        );
        assert!(
-            reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
+            reader.get(&u128::to_be_bytes(u128::MAX), &ctx).await?
+                == all_data.get(&u128::MAX).cloned()
        );

        Ok(())
@@ -985,6 +1027,7 @@ pub(crate) mod tests {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        for (key, val) in disk_btree_test_data::TEST_DATA {
            writer.append(&key, val)?;
@@ -997,16 +1040,21 @@ pub(crate) mod tests {

        // Test get() operation on all the keys
        for (key, val) in disk_btree_test_data::TEST_DATA {
-            assert_eq!(reader.get(&key).await?, Some(val));
+            assert_eq!(reader.get(&key, &ctx).await?, Some(val));
        }

        // Test full scan
        let mut count = 0;
        reader
-            .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
-                count += 1;
-                true
-            })
+            .visit(
+                &[0u8; 26],
+                VisitDirection::Forwards,
+                |_key, _value| {
+                    count += 1;
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,6 +2,7 @@
 //! used to keep in-memory layers spilled on disk.

 use crate::config::PageServerConf;
+use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
@@ -61,13 +62,17 @@ impl EphemeralFile {
        self.len
    }

-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, io::Error> {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
            loop {
                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum)
+                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
                    .await
                    .map_err(|e| {
                        std::io::Error::new(
@@ -103,7 +108,11 @@ impl EphemeralFile {
        }
    }

-    pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+    pub(crate) async fn write_blob(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<u64, io::Error> {
        struct Writer<'a> {
            ephemeral_file: &'a mut EphemeralFile,
            /// The block to which the next [`push_bytes`] will write.
@@ -120,7 +129,11 @@ impl EphemeralFile {
                })
            }
            #[inline(always)]
-            async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+            async fn push_bytes(
+                &mut self,
+                src: &[u8],
+                ctx: &RequestContext,
+            ) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -146,6 +159,7 @@ impl EphemeralFile {
                                    .read_immutable_buf(
                                        self.ephemeral_file.page_cache_file_id,
                                        self.blknum,
+                                        ctx,
                                    )
                                    .await
                                {
@@ -199,15 +213,15 @@ impl EphemeralFile {
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
-            writer.push_bytes(&len_buf).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            writer.push_bytes(&len_buf).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        }

        // Write the payload
-        writer.push_bytes(srcbuf).await?;
+        writer.push_bytes(srcbuf, ctx).await?;

        if srcbuf.len() < 0x80 {
            self.len += 1;
@@ -261,6 +275,8 @@ impl BlockReader for EphemeralFile {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::context::DownloadBehavior;
+    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
    use rand::{thread_rng, RngCore};
    use std::fs;
@@ -268,7 +284,15 @@ mod tests {

    fn harness(
        test_name: &str,
-    ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> {
+    ) -> Result<
+        (
+            &'static PageServerConf,
+            TenantId,
+            TimelineId,
+            RequestContext,
+        ),
+        io::Error,
+    > {
        let repo_dir = PageServerConf::test_repo_dir(test_name);
        let _ = fs::remove_dir_all(&repo_dir);
        let conf = PageServerConf::dummy_conf(repo_dir);
@@ -280,46 +304,57 @@ mod tests {
        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
        fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;

-        Ok((conf, tenant_id, timeline_id))
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        Ok((conf, tenant_id, timeline_id, ctx))
    }

    #[tokio::test]
    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
-        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
+        let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;

-        let pos_foo = file.write_blob(b"foo").await?;
+        let pos_foo = file.write_blob(b"foo", &ctx).await?;
        assert_eq!(
            b"foo",
-            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+            file.block_cursor()
+                .read_blob(pos_foo, &ctx)
+                .await?
+                .as_slice()
        );
-        let pos_bar = file.write_blob(b"bar").await?;
+        let pos_bar = file.write_blob(b"bar", &ctx).await?;
        assert_eq!(
            b"foo",
-            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+            file.block_cursor()
+                .read_blob(pos_foo, &ctx)
+                .await?
+                .as_slice()
        );
        assert_eq!(
            b"bar",
-            file.block_cursor().read_blob(pos_bar).await?.as_slice()
+            file.block_cursor()
+                .read_blob(pos_bar, &ctx)
+                .await?
+                .as_slice()
        );

        let mut blobs = Vec::new();
        for i in 0..10000 {
            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data).await?;
+            let pos = file.write_blob(&data, &ctx).await?;
            blobs.push((pos, data));
        }
        // also test with a large blobs
        for i in 0..100 {
            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data).await?;
+            let pos = file.write_blob(&data, &ctx).await?;
            blobs.push((pos, data));
        }

        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos).await?;
+            let actual = cursor.read_blob(pos, &ctx).await?;
            assert_eq!(actual, expected);
        }

@@ -327,8 +362,8 @@ mod tests {
        let mut large_data = Vec::new();
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data).await?;
-        let result = file.block_cursor().read_blob(pos_large).await?;
+        let pos_large = file.write_blob(&large_data, &ctx).await?;
+        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
        assert_eq!(result, large_data);

        Ok(())
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,7 +20,10 @@ use utils::crashsafe;

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::ControlPlaneClient;
+use crate::control_plane_client::{
+    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
+};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::delete::DeleteTenantFlow;
@@ -116,7 +119,23 @@ pub async fn init_tenant_mgr(

    // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
    let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
-        Some(client.re_attach().await?)
+        let result = match client.re_attach().await {
+            Ok(tenants) => tenants,
+            Err(RetryForeverError::ShuttingDown) => {
+                anyhow::bail!("Shut down while waiting for control plane re-attach response")
+            }
+        };
+
+        // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
+        // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
+        // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
+        // are processed, even though we don't block on recovery completing here.
+        resources
+            .deletion_queue_client
+            .recover(result.clone())
+            .await?;
+
+        Some(result)
    } else {
        info!("Control plane API not configured, tenant generations are disabled");
        None
@@ -285,29 +304,21 @@ pub(crate) fn schedule_local_tenant_processing(

    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if let Some(remote_storage) = resources.remote_storage {
-            match Tenant::spawn_attach(
-                conf,
-                tenant_id,
-                generation,
-                resources.broker_client,
-                tenants,
-                remote_storage,
-                ctx,
-            ) {
-                Ok(tenant) => tenant,
-                Err(e) => {
-                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
-                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
-                }
-            }
-        } else {
+        if resources.remote_storage.is_none() {
            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
            Tenant::create_broken_tenant(
                conf,
                tenant_id,
                "attaching mark file present but no remote storage configured".to_string(),
            )
+        } else {
+            match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
        }
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -438,8 +449,7 @@ pub async fn create_tenant(
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
    generation: Generation,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    resources: TenantSharedResources,
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
@@ -450,13 +460,9 @@ pub async fn create_tenant(
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        let tenant_resources = TenantSharedResources {
-            broker_client,
-            remote_storage,
-        };
        let created_tenant =
            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                generation, tenant_resources, None, &TENANTS, ctx)?;
+                generation, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -622,6 +628,7 @@ pub async fn load_tenant(
    generation: Generation,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    deletion_queue_client: DeletionQueueClient,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
@@ -635,6 +642,7 @@ pub async fn load_tenant(
        let resources = TenantSharedResources {
            broker_client,
            remote_storage,
+            deletion_queue_client
        };
        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None,  &TENANTS, ctx)
            .with_context(|| {
@@ -702,8 +710,7 @@ pub async fn attach_tenant(
    tenant_id: TenantId,
    generation: Generation,
    tenant_conf: TenantConfOpt,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: GenericRemoteStorage,
+    resources: TenantSharedResources,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
@@ -718,10 +725,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let resources = TenantSharedResources {
-            broker_client,
-            remote_storage: Some(remote_storage),
-        };
+
        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -116,8 +116,12 @@
 //! # Completion
 //!
 //! Once an operation has completed, we update
-//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
-//! to safekeepers that they can delete the WAL up to that LSN.
+//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
+//! and submit a request through the DeletionQueue to update
+//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
+//! validated that our generation is not stale.  It is this visible value
+//! that is advertized to safekeepers as a signal that that they can
+//! delete the WAL up to that LSN.
 //!
 //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
 //! for all pending operations to complete. It does not prevent more
@@ -200,7 +204,6 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

-mod delete;
 mod download;
 pub mod index;
 mod upload;
@@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

+use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{
    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -324,6 +328,8 @@ pub struct RemoteTimelineClient {
    metrics: Arc<RemoteTimelineClientMetrics>,

    storage_impl: GenericRemoteStorage,
+
+    deletion_queue_client: DeletionQueueClient,
 }

 impl RemoteTimelineClient {
@@ -335,6 +341,7 @@ impl RemoteTimelineClient {
    ///
    pub fn new(
        remote_storage: GenericRemoteStorage,
+        deletion_queue_client: DeletionQueueClient,
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -352,6 +359,7 @@ impl RemoteTimelineClient {
            timeline_id,
            generation,
            storage_impl: remote_storage,
+            deletion_queue_client,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
        }
@@ -413,13 +421,24 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
-        match &*self.upload_queue.lock().unwrap() {
+    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
-            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => {
-                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
-            }
+            UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
+            UploadQueue::Stopped(q) => q
+                .upload_queue_for_deletion
+                .get_last_remote_consistent_lsn_projected(),
+        }
+    }
+
+    pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
+        match &mut *self.upload_queue.lock().unwrap() {
+            UploadQueue::Uninitialized => None,
+            UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
+            UploadQueue::Stopped(q) => Some(
+                q.upload_queue_for_deletion
+                    .get_last_remote_consistent_lsn_visible(),
+            ),
        }
    }

@@ -434,11 +453,11 @@ impl RemoteTimelineClient {
        } else {
            0
        };
-        self.metrics.remote_physical_size_gauge().set(size);
+        self.metrics.remote_physical_size_set(size);
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_gauge().get()
+        self.metrics.remote_physical_size_get()
    }

    //
@@ -643,7 +662,7 @@ impl RemoteTimelineClient {
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: &[LayerFileName],
+        names: Vec<LayerFileName>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
@@ -663,10 +682,10 @@ impl RemoteTimelineClient {
            // Decorate our list of names with each name's generation, dropping
            // makes that are unexpectedly missing from our metadata.
            let with_generations: Vec<_> = names
-                .iter()
+                .into_iter()
                .filter_map(|name| {
                    // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(name);
+                    let meta = upload_queue.latest_files.remove(&name);

                    if let Some(meta) = meta {
                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -688,19 +707,17 @@ impl RemoteTimelineClient {
                self.schedule_index_upload(upload_queue, metadata);
            }

-            // schedule the actual deletions
-            for (name, generation) in with_generations {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: false,
-                    generation,
-                });
-                self.calls_unfinished_metric_begin(&op);
-                upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {name}");
+            for (name, gen) in &with_generations {
+                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
            }

+            // schedule the actual deletions
+            let op = UploadOp::Delete(Delete {
+                layers: with_generations,
+            });
+            self.calls_unfinished_metric_begin(&op);
+            upload_queue.queued_operations.push_back(op);
+
            // Launch the tasks immediately, if possible
            self.launch_queued_tasks(upload_queue);
        };
@@ -833,9 +850,7 @@ impl RemoteTimelineClient {
    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        let (mut receiver, deletions_queued) = {
-            let mut deletions_queued = 0;
-
+        let layers: Vec<RemotePath> = {
            let mut locked = self.upload_queue.lock().unwrap();
            let stopped = locked.stopped_mut()?;

@@ -847,42 +862,30 @@ impl RemoteTimelineClient {

            stopped
                .upload_queue_for_deletion
-                .queued_operations
-                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
-
-            // schedule the actual deletions
-            for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: true,
-                    generation: meta.generation,
-                });
-
-                self.calls_unfinished_metric_begin(&op);
-                stopped
-                    .upload_queue_for_deletion
-                    .queued_operations
-                    .push_back(op);
-
-                info!("scheduled layer file deletion {name}");
-                deletions_queued += 1;
-            }
-
-            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
-
-            (
-                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
-                deletions_queued,
-            )
+                .latest_files
+                .drain()
+                .map(|(file_name, meta)| {
+                    remote_layer_path(
+                        &self.tenant_id,
+                        &self.timeline_id,
+                        &file_name,
+                        meta.generation,
+                    )
+                })
+                .collect()
        };

-        receiver.changed().await.context("upload queue shut down")?;
+        let layer_deletion_count = layers.len();
+        self.deletion_queue_client.push_immediate(layers).await?;

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
        let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);

+        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // taking the burden of listing all the layers that we already know we should delete.
+        self.deletion_queue_client.flush_immediate().await?;
+
        let remaining = backoff::retry(
            || async {
                self.storage_impl
@@ -910,17 +913,9 @@ impl RemoteTimelineClient {
            })
            .collect();

+        let not_referenced_count = remaining.len();
        if !remaining.is_empty() {
-            backoff::retry(
-                || async { self.storage_impl.delete_objects(&remaining).await },
-                |_e| false,
-                FAILED_UPLOAD_WARN_THRESHOLD,
-                FAILED_REMOTE_OP_RETRIES,
-                "delete_objects",
-                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
-            )
-            .await
-            .context("delete_objects")?;
+            self.deletion_queue_client.push_immediate(remaining).await?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -931,18 +926,14 @@ impl RemoteTimelineClient {

        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

-        debug!("deleting index part");
+        debug!("enqueuing index part deletion");
+        self.deletion_queue_client
+            .push_immediate([index_file_path].to_vec())
+            .await?;

-        backoff::retry(
-            || async { self.storage_impl.delete(&index_file_path).await },
-            |_e| false,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "delete_index",
-            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
-        )
-        .await
-        .context("delete_index")?;
+        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
+        // for a flush to a persistent deletion list so that we may be sure deletion will occur.
+        self.deletion_queue_client.flush_immediate().await?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
@@ -950,7 +941,7 @@ impl RemoteTimelineClient {
            ))?
        });

-        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
+        info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");

        Ok(())
    }
@@ -1140,21 +1131,16 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(delete) => {
-                    let path = &self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(delete.layer_file_name.file_name());
-                    delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
-                        .measure_remote_op(
-                            self.tenant_id,
-                            self.timeline_id,
-                            delete.file_kind,
-                            RemoteOpKind::Delete,
-                            Arc::clone(&self.metrics),
-                        )
-                        .await
-                }
+                UploadOp::Delete(delete) => self
+                    .deletion_queue_client
+                    .push_layers(
+                        self.tenant_id,
+                        self.timeline_id,
+                        self.generation,
+                        delete.layers.clone(),
+                    )
+                    .await
+                    .map_err(|e| anyhow::anyhow!(e)),
                UploadOp::Barrier(_) => {
                    // unreachable. Barrier operations are handled synchronously in
                    // launch_queued_tasks
@@ -1210,18 +1196,12 @@ impl RemoteTimelineClient {
        }

        // The task has completed successfully. Remove it from the in-progress list.
-        {
+        let lsn_update = {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
                UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(stopped) => {
-                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
-                    // then stop() took care of it so we just return.
-                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
-                    match &task.op {
-                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
-                        _ => None
-                    }
+                UploadQueue::Stopped(_stopped) => {
+                    None
                },
                UploadQueue::Initialized(qi) => { Some(qi) }
            };
@@ -1236,23 +1216,51 @@ impl RemoteTimelineClient {

            upload_queue.inprogress_tasks.remove(&task.task_id);

-            match task.op {
+            let lsn_update = match task.op {
                UploadOp::UploadLayer(_, _) => {
                    upload_queue.num_inprogress_layer_uploads -= 1;
+                    None
                }
                UploadOp::UploadMetadata(_, lsn) => {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
-                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
+                    // XXX monotonicity check?
+
+                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
+                    if self.generation.is_none() {
+                        // Legacy mode: skip validating generation
+                        upload_queue.visible_remote_consistent_lsn.store(lsn);
+                        None
+                    } else {
+                        Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
+                    }
                }
                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions -= 1;
+                    None
                }
                UploadOp::Barrier(_) => unreachable!(),
            };

            // Launch any queued tasks that were unblocked by this one.
            self.launch_queued_tasks(upload_queue);
+            lsn_update
+        };
+
+        if let Some((lsn, slot)) = lsn_update {
+            // Updates to the remote_consistent_lsn we advertise to pageservers
+            // are all routed through the DeletionQueue, to enforce important
+            // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
+            self.deletion_queue_client
+                .update_remote_consistent_lsn(
+                    self.tenant_id,
+                    self.timeline_id,
+                    self.generation,
+                    lsn,
+                    slot,
+                )
+                .await;
        }
+
        self.calls_unfinished_metric_end(&task.op);
    }

@@ -1278,8 +1286,8 @@ impl RemoteTimelineClient {
                    reason: "metadata uploads are tiny",
                },
            ),
-            UploadOp::Delete(delete) => (
-                delete.file_kind,
+            UploadOp::Delete(_delete) => (
+                RemoteOpFileKind::Layer,
                RemoteOpKind::Delete,
                DontTrackSize {
                    reason: "should we track deletes? positive or negative sign?",
@@ -1341,7 +1349,10 @@ impl RemoteTimelineClient {
                        latest_files: initialized.latest_files.clone(),
                        latest_files_changes_since_metadata_upload_scheduled: 0,
                        latest_metadata: initialized.latest_metadata.clone(),
-                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
+                        projected_remote_consistent_lsn: None,
+                        visible_remote_consistent_lsn: initialized
+                            .visible_remote_consistent_lsn
+                            .clone(),
                        num_inprogress_layer_uploads: 0,
                        num_inprogress_metadata_uploads: 0,
                        num_inprogress_deletions: 0,
@@ -1405,13 +1416,13 @@ pub fn remote_layer_path(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    layer_file_name: &LayerFileName,
-    layer_meta: &LayerFileMetadata,
+    generation: Generation,
 ) -> RemotePath {
    // Generation-aware key format
    let path = format!(
        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
        layer_file_name.file_name(),
-        layer_meta.generation.get_suffix()
+        generation.get_suffix()
    );

    RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1554,7 +1565,6 @@ mod tests {

    impl TestSetup {
        async fn new(test_name: &str) -> anyhow::Result<Self> {
-            // Use a current-thread runtime in the test
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = harness.load().await;
@@ -1580,6 +1590,7 @@ mod tests {
                timeline_id: TIMELINE_ID,
                generation,
                storage_impl: self.harness.remote_storage.clone(),
+                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
                metrics: Arc::new(RemoteTimelineClientMetrics::new(
                    &self.harness.tenant_id,
@@ -1749,7 +1760,7 @@ mod tests {
            )
            .unwrap();
        client
-            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
@@ -1775,6 +1786,7 @@ mod tests {

        // Finish them
        client.wait_completion().await.unwrap();
+        harness.deletion_queue.pump().await;

        assert_remote_files(
            &[
--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ b/pageserver/src/tenant/remote_timeline_client/delete.rs
@@ -1,34 +0,0 @@
-//! Helper functions to delete files from remote storage with a RemoteStorage
-use anyhow::Context;
-use std::path::Path;
-use tracing::debug;
-
-use remote_storage::GenericRemoteStorage;
-
-use crate::{
-    config::PageServerConf,
-    tenant::{remote_timeline_client::remote_path, Generation},
-};
-
-pub(super) async fn delete_layer<'a>(
-    conf: &'static PageServerConf,
-    storage: &'a GenericRemoteStorage,
-    local_layer_path: &'a Path,
-    generation: Generation,
-) -> anyhow::Result<()> {
-    fail::fail_point!("before-delete-layer", |_| {
-        anyhow::bail!("failpoint before-delete-layer")
-    });
-    debug!("Deleting layer from remote storage: {local_layer_path:?}",);
-
-    let path_to_delete = remote_path(conf, local_layer_path, generation)?;
-
-    // We don't want to print an error if the delete failed if the file has
-    // already been deleted. Thankfully, in this situation S3 already
-    // does not yield an error. While OS-provided local file system APIs do yield
-    // errors, we avoid them in the `LocalFs` wrapper.
-    storage
-        .delete(&path_to_delete)
-        .await
-        .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
-}
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>(
        .timeline_path(&tenant_id, &timeline_id)
        .join(layer_file_name.file_name());

-    let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
+    let remote_path = remote_layer_path(
+        &tenant_id,
+        &timeline_id,
+        layer_file_name,
+        layer_metadata.generation,
+    );

    // Perform a rename inspired by durable_rename from file_utils.c.
    // The sequence:
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -28,7 +28,7 @@
 //! "values" part.
 //!
 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
@@ -317,11 +317,11 @@ impl DeltaLayer {

        tree_reader.dump().await?;

-        let keys = DeltaLayerInner::load_keys(&inner).await?;
+        let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;

        // A subroutine to dump a single blob
-        async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
+        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
            let val = Value::des(&buf)?;
            let desc = match val {
                Value::Image(img) => {
@@ -342,7 +342,7 @@ impl DeltaLayer {

        for entry in keys {
            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val).await {
+            let desc = match dump_blob(val, ctx).await {
                Ok(desc) => desc,
                Err(err) => {
                    let err: anyhow::Error = err;
@@ -370,7 +370,7 @@ impl DeltaLayer {
            .load(LayerAccessKind::GetValueReconstructData, ctx)
            .await?;
        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
            .await
    }

@@ -453,12 +453,12 @@ impl DeltaLayer {
        self.access_stats.record_access(access_kind, ctx);
        // Quick exit if already loaded
        self.inner
-            .get_or_try_init(|| self.load_inner())
+            .get_or_try_init(|| self.load_inner(ctx))
            .await
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
+    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

        let summary = match &self.path_or_conf {
@@ -466,7 +466,7 @@ impl DeltaLayer {
            PathOrConf::Path(_) => None,
        };

-        let loaded = DeltaLayerInner::load(&path, summary).await?;
+        let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -554,7 +554,7 @@ impl DeltaLayer {
            .load(LayerAccessKind::KeyIter, ctx)
            .await
            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
+        DeltaLayerInner::load_keys(inner, ctx)
            .await
            .context("Layer index is corrupted")
    }
@@ -849,13 +849,14 @@ impl DeltaLayerInner {
    pub(super) async fn load(
        path: &std::path::Path,
        summary: Option<Summary>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .await
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0).await?;
+        let summary_blk = file.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -883,6 +884,7 @@ impl DeltaLayerInner {
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        let mut need_image = true;
        // Scan the page versions backwards, starting from `lsn`.
@@ -897,27 +899,38 @@ impl DeltaLayerInner {
        let mut offsets: Vec<(Lsn, u64)> = Vec::new();

        tree_reader
-            .visit(&search_key.0, VisitDirection::Backwards, |key, value| {
-                let blob_ref = BlobRef(value);
-                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
-                    return false;
-                }
-                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
-                if entry_lsn < lsn_range.start {
-                    return false;
-                }
-                offsets.push((entry_lsn, blob_ref.pos()));
+            .visit(
+                &search_key.0,
+                VisitDirection::Backwards,
+                |key, value| {
+                    let blob_ref = BlobRef(value);
+                    if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                        return false;
+                    }
+                    let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                    if entry_lsn < lsn_range.start {
+                        return false;
+                    }
+                    offsets.push((entry_lsn, blob_ref.pos()));

-                !blob_ref.will_init()
-            })
+                    !blob_ref.will_init()
+                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
+            )
            .await?;

+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
        // Ok, 'offsets' now contains the offsets of all the entries we need to read
        let cursor = file.block_cursor();
        let mut buf = Vec::new();
        for (entry_lsn, pos) in offsets {
            cursor
-                .read_blob_into_buf(pos, &mut buf)
+                .read_blob_into_buf(pos, &mut buf, ctx)
                .await
                .with_context(|| {
                    format!(
@@ -958,9 +971,10 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
-        this: &T,
-    ) -> Result<Vec<DeltaEntry<'_>>> {
+    pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
+        this: &'a T,
+        ctx: &'b RequestContext,
+    ) -> Result<Vec<DeltaEntry<'a>>> {
        let dl = this.as_ref();
        let file = &dl.file;

@@ -997,6 +1011,9 @@ impl DeltaLayerInner {
                    all_keys.push(entry);
                    true
                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
            )
            .await?;
        if let Some(last) = all_keys.last_mut() {
@@ -1026,9 +1043,9 @@ pub struct ValueRef<'a> {

 impl<'a> ValueRef<'a> {
    /// Loads the value from disk
-    pub async fn load(&self) -> Result<Value> {
+    pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
+        let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
        let val = Value::des(&buf)?;
        Ok(val)
    }
@@ -1037,7 +1054,11 @@ impl<'a> ValueRef<'a> {
 pub(crate) struct Adapter<T>(T);

 impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum).await
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
+        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -24,7 +24,7 @@
 //! mapping from Key to an offset in the "values" part.  The
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
@@ -237,10 +237,15 @@ impl ImageLayer {
        tree_reader.dump().await?;

        tree_reader
-            .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
-                println!("key: {} offset {}", hex::encode(key), value);
-                true
-            })
+            .visit(
+                &[0u8; KEY_SIZE],
+                VisitDirection::Forwards,
+                |key, value| {
+                    println!("key: {} offset {}", hex::encode(key), value);
+                    true
+                },
+                ctx,
+            )
            .await?;

        Ok(())
@@ -261,7 +266,7 @@ impl ImageLayer {
            .load(LayerAccessKind::GetValueReconstructData, ctx)
            .await?;
        inner
-            .get_value_reconstruct_data(key, reconstruct_state)
+            .get_value_reconstruct_data(key, reconstruct_state, ctx)
            .await
            // FIXME: makes no sense to dump paths
            .with_context(|| format!("read {}", self.path().display()))
@@ -335,12 +340,12 @@ impl ImageLayer {
    ) -> Result<&ImageLayerInner> {
        self.access_stats.record_access(access_kind, ctx);
        self.inner
-            .get_or_try_init(|| self.load_inner())
+            .get_or_try_init(|| self.load_inner(ctx))
            .await
            .with_context(|| format!("Failed to load image layer {}", self.path().display()))
    }

-    async fn load_inner(&self) -> Result<ImageLayerInner> {
+    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

        let expected_summary = match &self.path_or_conf {
@@ -349,7 +354,8 @@ impl ImageLayer {
        };

        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
+            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
+                .await?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -436,12 +442,13 @@ impl ImageLayerInner {
        path: &std::path::Path,
        lsn: Lsn,
        summary: Option<Summary>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .await
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0).await?;
+        let summary_blk = file.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -470,16 +477,30 @@ impl ImageLayerInner {
        &self,
        key: Key,
        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        let file = &self.file;
        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
-        if let Some(offset) = tree_reader.get(&keybuf).await? {
+        if let Some(offset) = tree_reader
+            .get(
+                &keybuf,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                    .build(),
+            )
+            .await?
+        {
            let blob = file
                .block_cursor()
-                .read_blob(offset)
+                .read_blob(
+                    offset,
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::ImageLayerValue)
+                        .build(),
+                )
                .await
                .with_context(|| format!("failed to read value from offset {}", offset))?;
            let value = Bytes::from(blob);
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -5,7 +5,7 @@
 //! its position in the file, is kept in memory, though.
 //!
 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
@@ -106,7 +106,7 @@ impl InMemoryLayer {
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
-    pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().await;

        let end_str = self.end_lsn_or_max();
@@ -125,7 +125,7 @@ impl InMemoryLayer {
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf).await?;
+                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
                let val = Value::des(&buf);
                match val {
                    Ok(Value::Image(img)) => {
@@ -158,11 +158,15 @@ impl InMemoryLayer {
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValueReconstructState,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
        let inner = self.inner.read().await;

        let reader = inner.file.block_cursor();
@@ -171,7 +175,7 @@ impl InMemoryLayer {
        if let Some(vec_map) = inner.index.get(&key) {
            let slice = vec_map.slice_range(lsn_range);
            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos).await?;
+                let buf = reader.read_blob(*pos, &ctx).await?;
                let value = Value::des(&buf)?;
                match value {
                    Value::Image(img) => {
@@ -263,7 +267,13 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    pub async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
        let inner: &mut _ = &mut *self.inner.write().await;
        self.assert_writable();
@@ -275,7 +285,15 @@ impl InMemoryLayer {
            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
            buf.clear();
            val.ser_into(&mut buf)?;
-            inner.file.write_blob(&buf).await?
+            inner
+                .file
+                .write_blob(
+                    &buf,
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::InMemoryLayer)
+                        .build(),
+                )
+                .await?
        };

        let vec_map = inner.index.entry(key).or_default();
@@ -313,7 +331,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -343,11 +361,14 @@ impl InMemoryLayer {
        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);

+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
        for (key, vec_map) in keys.iter() {
            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf).await?;
+                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
                let will_init = Value::des(&buf)?.will_init();
                delta_layer_writer
                    .put_value_bytes(key, *lsn, &buf, will_init)
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
@@ -143,6 +144,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
    pub remote_client: Option<RemoteTimelineClient>,
+    pub deletion_queue_client: DeletionQueueClient,
 }

 pub struct Timeline {
@@ -471,7 +473,7 @@ impl Timeline {
        // The cached image can be returned directly if there is no WAL between the cached image
        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
        // for redo.
-        let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
+        let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
            Some((cached_lsn, cached_img)) => {
                match cached_lsn.cmp(&lsn) {
                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
@@ -521,9 +523,23 @@ impl Timeline {
        self.disk_consistent_lsn.load()
    }

-    pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
+    /// remote_consistent_lsn from the perspective of the tenant's current generation,
+    /// not validated with control plane yet.
+    /// See [`Self::get_remote_consistent_lsn_visible`].
+    pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
        if let Some(remote_client) = &self.remote_client {
-            remote_client.last_uploaded_consistent_lsn()
+            remote_client.remote_consistent_lsn_projected()
+        } else {
+            None
+        }
+    }
+
+    /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
+    /// i.e. a value of remote_consistent_lsn_projected which has undergone
+    /// generation validation in the deletion queue.
+    pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
+        if let Some(remote_client) = &self.remote_client {
+            remote_client.remote_consistent_lsn_visible()
        } else {
            None
        }
@@ -543,7 +559,7 @@ impl Timeline {
    }

    pub fn resident_physical_size(&self) -> u64 {
-        self.metrics.resident_physical_size_gauge.get()
+        self.metrics.resident_physical_size_get()
    }

    ///
@@ -1293,10 +1309,7 @@ impl Timeline {
        // will treat the file as a local layer again, count it towards resident size,
        // and it'll be like the layer removal never happened.
        // The bump in resident size is perhaps unexpected but overall a robust behavior.
-        self.metrics
-            .resident_physical_size_gauge
-            .sub(layer_file_size);
-
+        self.metrics.resident_physical_size_sub(layer_file_size);
        self.metrics.evictions.inc();

        if let Some(delta) = local_layer_residence_duration {
@@ -1820,7 +1833,7 @@ impl Timeline {
            for (layer, m) in needs_upload {
                rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
            }
-            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
+            rtc.schedule_layer_file_deletion(needs_cleanup)?;
            rtc.schedule_index_upload_for_file_changes()?;
            // Tenant::create_timeline will wait for these uploads to happen before returning, or
            // on retry.
@@ -1830,9 +1843,7 @@ impl Timeline {
            "loaded layer map with {} layers at {}, total physical size: {}",
            num_layers, disk_consistent_lsn, total_physical_size
        );
-        self.metrics
-            .resident_physical_size_gauge
-            .set(total_physical_size);
+        self.metrics.resident_physical_size_set(total_physical_size);

        timer.stop_and_record();
        Ok(())
@@ -2518,13 +2529,18 @@ impl Timeline {
        }
    }

-    async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
+    async fn lookup_cached_page(
+        &self,
+        key: &Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Option<(Lsn, Bytes)> {
        let cache = page_cache::get();

        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
        // We should look at the key to determine if it's a cacheable object
        let (lsn, read_guard) = cache
-            .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
+            .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx)
            .await?;
        let img = Bytes::from(read_guard.to_vec());
        Some((lsn, img))
@@ -2558,10 +2574,16 @@ impl Timeline {
        Ok(layer)
    }

-    async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
+    async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val).await?;
+        layer.put_value(key, lsn, val, ctx).await?;
        Ok(())
    }

@@ -2733,7 +2755,7 @@ impl Timeline {
                // Normal case, write out a L0 delta layer file.
                // `create_delta_layer` will not modify the layer map.
                // We will remove frozen layer and add delta layer in one atomic operation later.
-                let layer = self.create_delta_layer(&frozen_layer).await?;
+                let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
                (
                    HashMap::from([(
                        layer.filename(),
@@ -2856,19 +2878,21 @@ impl Timeline {
    async fn create_delta_layer(
        self: &Arc<Self>,
        frozen_layer: &Arc<InMemoryLayer>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<DeltaLayer> {
        let span = tracing::info_span!("blocking");
        let new_delta: DeltaLayer = tokio::task::spawn_blocking({
            let _g = span.entered();
            let self_clone = Arc::clone(self);
            let frozen_layer = Arc::clone(frozen_layer);
+            let ctx = ctx.attached_child();
            move || {
                // Write it out
                // Keep this inside `spawn_blocking` and `Handle::current`
                // as long as the write path is still sync and the read impl
                // is still not fully async. Otherwise executor threads would
                // be blocked.
-                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
+                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&ctx))?;
                let new_delta_path = new_delta.path();

                // Sync it to disk.
@@ -3574,7 +3598,7 @@ impl Timeline {
            key, lsn, ref val, ..
        } in all_values_iter
        {
-            let value = val.load().await?;
+            let value = val.load(ctx).await?;
            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
@@ -3862,7 +3886,7 @@ impl Timeline {

        // Also schedule the deletions in remote storage
        if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+            remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
        }

        Ok(())
@@ -4197,7 +4221,7 @@ impl Timeline {
            }

            if let Some(remote_client) = &self.remote_client {
-                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+                remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
            }

            apply.flush();
@@ -4369,7 +4393,7 @@ impl Timeline {

                    // XXX the temp file is still around in Err() case
                    // and consumes space until we clean up upon pageserver restart.
-                    self_clone.metrics.resident_physical_size_gauge.add(*size);
+                    self_clone.metrics.resident_physical_size_add(*size);

                    // Download complete. Replace the RemoteLayer with the corresponding
                    // Delta- or ImageLayer in the layer map.
@@ -4699,8 +4723,14 @@ impl<'a> TimelineWriter<'a> {
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
-    pub async fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
-        self.tl.put_value(key, lsn, value).await
+    pub async fn put(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        value: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.tl.put_value(key, lsn, value, ctx).await
    }

    pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,6 +14,7 @@ use utils::{

 use crate::{
    config::PageServerConf,
+    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
        metadata::TimelineMetadata,
@@ -407,6 +408,7 @@ impl DeleteTimelineFlow {
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: Option<RemoteTimelineClient>,
+        deletion_queue_client: DeletionQueueClient,
        init_order: Option<&InitializationOrder>,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -416,7 +418,10 @@ impl DeleteTimelineFlow {
                timeline_id,
                local_metadata,
                None, // Ancestor is not needed for deletion.
-                TimelineResources { remote_client },
+                TimelineResources {
+                    remote_client,
+                    deletion_queue_client,
+                },
                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -263,7 +263,7 @@ impl LayerManager {
        let desc = layer.layer_desc();
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_gauge.sub(desc.file_size);
+            metrics.resident_physical_size_sub(desc.file_size);
        }

        // TODO Removing from the bottom of the layer map is expensive.
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection(
            })?;

        if let Some(last_lsn) = status_update {
-            let timeline_remote_consistent_lsn =
-                timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+            let timeline_remote_consistent_lsn = timeline
+                .get_remote_consistent_lsn_visible()
+                .unwrap_or(Lsn(0));

            // The last LSN we processed. It is not guaranteed to survive pageserver crash.
            let last_received_lsn = last_lsn;
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,5 +1,3 @@
-use crate::metrics::RemoteOpFileKind;
-
 use super::storage_layer::LayerFileName;
 use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
@@ -11,6 +9,7 @@ use std::fmt::Debug;
 use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;
+use utils::lsn::AtomicLsn;

 use std::sync::atomic::AtomicU32;
 use utils::lsn::Lsn;
@@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized {
    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
    /// Safekeeper can rely on it to make decisions for WAL storage.
-    pub(crate) last_uploaded_consistent_lsn: Lsn,
+    ///
+    /// visible_remote_consistent_lsn is only updated after our generation has been validated with
+    /// the control plane (unlesss a timeline's generation is None, in which case
+    /// we skip validation)
+    pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
+    pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,

    // Breakdown of different kinds of tasks currently in-progress
    pub(crate) num_inprogress_layer_uploads: usize,
@@ -81,6 +85,14 @@ impl UploadQueueInitialized {
    pub(super) fn no_pending_work(&self) -> bool {
        self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
    }
+
+    pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
+        self.visible_remote_consistent_lsn.load()
+    }
+
+    pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+        self.projected_remote_consistent_lsn
+    }
 }

 #[derive(Clone, Copy)]
@@ -114,9 +126,8 @@ impl UploadQueue {
            latest_files: HashMap::new(),
            latest_files_changes_since_metadata_upload_scheduled: 0,
            latest_metadata: metadata.clone(),
-            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
-            // safekeepers from garbage-collecting anything.
-            last_uploaded_consistent_lsn: Lsn(0),
+            projected_remote_consistent_lsn: None,
+            visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
@@ -158,7 +169,10 @@ impl UploadQueue {
            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
            latest_metadata: index_part.metadata.clone(),
-            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
+            projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
+            visible_remote_consistent_lsn: Arc::new(
+                index_part.metadata.disk_consistent_lsn().into(),
+            ),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
@@ -201,12 +215,11 @@ pub(crate) struct UploadTask {
    pub(crate) op: UploadOp,
 }

+/// A deletion of some layers within the lifetime of a timeline.  This is not used
+/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug)]
 pub(crate) struct Delete {
-    pub(crate) file_kind: RemoteOpFileKind,
-    pub(crate) layer_file_name: LayerFileName,
-    pub(crate) scheduled_from_timeline_delete: bool,
-    pub(crate) generation: Generation,
+    pub(crate) layers: Vec<(LayerFileName, Generation)>,
 }

 #[derive(Debug)]
@@ -217,7 +230,7 @@ pub(crate) enum UploadOp {
    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),

-    /// Delete a layer file
+    /// Delete layer files
    Delete(Delete),

    /// Barrier. When the barrier operation is reached,
@@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp {
            UploadOp::UploadMetadata(_, lsn) => {
                write!(f, "UploadMetadata(lsn: {})", lsn)
            }
-            UploadOp::Delete(delete) => write!(
-                f,
-                "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
-                delete.layer_file_name.file_name(),
-                delete.scheduled_from_timeline_delete,
-                delete.generation
-            ),
+            UploadOp::Delete(delete) => {
+                write!(f, "Delete({} layers)", delete.layers.len(),)
+            }
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
    }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -650,6 +650,12 @@ mod tests {
        File(File),
    }

+    impl From<VirtualFile> for MaybeVirtualFile {
+        fn from(vf: VirtualFile) -> Self {
+            MaybeVirtualFile::VirtualFile(vf)
+        }
+    }
+
    impl MaybeVirtualFile {
        async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
            match self {
@@ -887,4 +893,54 @@ mod tests {

        Ok(())
    }
+
+    #[tokio::test]
+    async fn test_atomic_overwrite_basic() {
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
+        std::fs::create_dir_all(&testdir).unwrap();
+
+        let path = testdir.join("myfile");
+        let tmp_path = testdir.join("myfile.tmp");
+
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+            .await
+            .unwrap();
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let post = file.read_string().await.unwrap();
+        assert_eq!(post, "foo");
+        assert!(!tmp_path.exists());
+        drop(file);
+
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
+            .await
+            .unwrap();
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let post = file.read_string().await.unwrap();
+        assert_eq!(post, "bar");
+        assert!(!tmp_path.exists());
+        drop(file);
+    }
+
+    #[tokio::test]
+    async fn test_atomic_overwrite_preexisting_tmp() {
+        let testdir =
+            crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
+        std::fs::create_dir_all(&testdir).unwrap();
+
+        let path = testdir.join("myfile");
+        let tmp_path = testdir.join("myfile.tmp");
+
+        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
+        assert!(tmp_path.exists());
+
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+            .await
+            .unwrap();
+
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let post = file.read_string().await.unwrap();
+        assert_eq!(post, "foo");
+        assert!(!tmp_path.exists());
+        drop(file);
+    }
 }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -363,7 +363,7 @@ impl<'a> WalIngest<'a> {

        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit().await?;
+        modification.commit(ctx).await?;

        Ok(())
    }
@@ -444,6 +444,7 @@ impl<'a> WalIngest<'a> {
        // need to clear the corresponding bits in the visibility map.
        let mut new_heap_blkno: Option<u32> = None;
        let mut old_heap_blkno: Option<u32> = None;
+        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;

        match self.timeline.pg_version {
            14 => {
@@ -470,14 +471,20 @@ impl<'a> WalIngest<'a> {
                        // we can't validate the remaining number of bytes without parsing
                        // the tuple data.
                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
-                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
                        }
                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
                            // non-HOT update where the new tuple goes to different page than
                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
                            // set.
-                            new_heap_blkno = Some(decoded.blocks[1].blkno);
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v14::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
                        }
                    }
                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -497,6 +504,12 @@ impl<'a> WalIngest<'a> {
                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v14::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
                    }
                } else {
                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -526,14 +539,20 @@ impl<'a> WalIngest<'a> {
                        // we can't validate the remaining number of bytes without parsing
                        // the tuple data.
                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
-                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
                        }
                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
                            // non-HOT update where the new tuple goes to different page than
                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
                            // set.
-                            new_heap_blkno = Some(decoded.blocks[1].blkno);
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v15::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
                        }
                    }
                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -553,6 +572,12 @@ impl<'a> WalIngest<'a> {
                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v15::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
                    }
                } else {
                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -582,14 +607,20 @@ impl<'a> WalIngest<'a> {
                        // we can't validate the remaining number of bytes without parsing
                        // the tuple data.
                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
-                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
                        }
                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
                            // non-HOT update where the new tuple goes to different page than
                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
                            // set.
-                            new_heap_blkno = Some(decoded.blocks[1].blkno);
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v16::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
                        }
                    }
                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -609,6 +640,12 @@ impl<'a> WalIngest<'a> {
                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v16::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
                    }
                } else {
                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -616,7 +653,6 @@ impl<'a> WalIngest<'a> {
            }
            _ => {}
        }
-        // FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?

        // Clear the VM bits if required.
        if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
@@ -660,7 +696,7 @@ impl<'a> WalIngest<'a> {
                        NeonWalRecord::ClearVisibilityMapFlags {
                            new_heap_blkno,
                            old_heap_blkno,
-                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                            flags,
                        },
                        ctx,
                    )
@@ -676,7 +712,7 @@ impl<'a> WalIngest<'a> {
                            NeonWalRecord::ClearVisibilityMapFlags {
                                new_heap_blkno,
                                old_heap_blkno: None,
-                                flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                                flags,
                            },
                            ctx,
                        )
@@ -690,7 +726,7 @@ impl<'a> WalIngest<'a> {
                            NeonWalRecord::ClearVisibilityMapFlags {
                                new_heap_blkno: None,
                                old_heap_blkno,
-                                flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                                flags,
                            },
                            ctx,
                        )
@@ -717,6 +753,8 @@ impl<'a> WalIngest<'a> {
        // need to clear the corresponding bits in the visibility map.
        let mut new_heap_blkno: Option<u32> = None;
        let mut old_heap_blkno: Option<u32> = None;
+        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
+
        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

        match self.timeline.pg_version {
@@ -745,14 +783,14 @@ impl<'a> WalIngest<'a> {
                        // we can't validate the remaining number of bytes without parsing
                        // the tuple data.
                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
-                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
                        }
                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
                            // non-HOT update where the new tuple goes to different page than
                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
                            // set.
-                            new_heap_blkno = Some(decoded.blocks[1].blkno);
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
@@ -772,7 +810,11 @@ impl<'a> WalIngest<'a> {
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_LOCK => {
-                        /* XLOG_NEON_HEAP_LOCK doesn't need special care */
+                        let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
                    }
                    info => bail!("Unknown WAL record type for Neon RMGR: {}", info),
                }
@@ -783,8 +825,6 @@ impl<'a> WalIngest<'a> {
            ),
        }

-        // FIXME: What about XLOG_NEON_HEAP_LOCK?
-
        // Clear the VM bits if required.
        if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
            let vm_rel = RelTag {
@@ -827,7 +867,7 @@ impl<'a> WalIngest<'a> {
                        NeonWalRecord::ClearVisibilityMapFlags {
                            new_heap_blkno,
                            old_heap_blkno,
-                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                            flags,
                        },
                        ctx,
                    )
@@ -843,7 +883,7 @@ impl<'a> WalIngest<'a> {
                            NeonWalRecord::ClearVisibilityMapFlags {
                                new_heap_blkno,
                                old_heap_blkno: None,
-                                flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                                flags,
                            },
                            ctx,
                        )
@@ -857,7 +897,7 @@ impl<'a> WalIngest<'a> {
                            NeonWalRecord::ClearVisibilityMapFlags {
                                new_heap_blkno: None,
                                old_heap_blkno,
-                                flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                                flags,
                            },
                            ctx,
                        )
@@ -1521,7 +1561,7 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
-        m.commit().await?;
+        m.commit(ctx).await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

        Ok(walingest)
@@ -1540,22 +1580,22 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        assert_current_logical_size(&tline, Lsn(0x50));

@@ -1641,7 +1681,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_current_logical_size(&tline, Lsn(0x60));

        // Check reported size and contents after truncation
@@ -1683,7 +1723,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1696,7 +1736,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1721,7 +1761,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1760,7 +1800,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1779,7 +1819,7 @@ mod tests {
        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check that rel is not visible anymore
        assert_eq!(
@@ -1797,7 +1837,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1836,7 +1876,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
@@ -1881,7 +1921,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check reported size and contents after truncation
        assert_eq!(
@@ -1930,7 +1970,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit().await?;
+        m.commit(&ctx).await?;

        assert_eq!(
            tline
@@ -1977,7 +2017,7 @@ mod tests {
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
-            m.commit().await?;
+            m.commit(&ctx).await?;
        }

        assert_current_logical_size(&tline, Lsn(lsn));
@@ -1993,7 +2033,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
@@ -2006,7 +2046,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
@@ -2022,7 +2062,7 @@ mod tests {
            walingest
                .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
                .await?;
-            m.commit().await?;
+            m.commit(&ctx).await?;
            assert_eq!(
                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -219,20 +219,66 @@ pub mod v14 {
                old_offnum: buf.get_u16_le(),
                old_infobits_set: buf.get_u8(),
                flags: buf.get_u8(),
-                t_cid: buf.get_u32(),
+                t_cid: buf.get_u32_le(),
                new_xmax: buf.get_u32_le(),
                new_offnum: buf.get_u16_le(),
            }
        }
    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapLock {
+        pub locking_xid: TransactionId,
+        pub offnum: OffsetNumber,
+        pub _padding: u16,
+        pub t_cid: u32,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapLock {
+        pub fn decode(buf: &mut Bytes) -> XlHeapLock {
+            XlHeapLock {
+                locking_xid: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                _padding: buf.get_u16_le(),
+                t_cid: buf.get_u32_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapLockUpdated {
+        pub xmax: TransactionId,
+        pub offnum: OffsetNumber,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapLockUpdated {
+        pub fn decode(buf: &mut Bytes) -> XlHeapLockUpdated {
+            XlHeapLockUpdated {
+                xmax: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
 }

 pub mod v15 {
-    pub use super::v14::{XlHeapDelete, XlHeapInsert, XlHeapMultiInsert, XlHeapUpdate};
+    pub use super::v14::{
+        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
+    };
 }

 pub mod v16 {
-    pub use super::v14::{XlHeapInsert, XlHeapMultiInsert};
+    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
    use bytes::{Buf, Bytes};
    use postgres_ffi::{OffsetNumber, TransactionId};

@@ -278,6 +324,26 @@ pub mod v16 {
        }
    }

+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapLock {
+        pub locking_xid: TransactionId,
+        pub offnum: OffsetNumber,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapLock {
+        pub fn decode(buf: &mut Bytes) -> XlHeapLock {
+            XlHeapLock {
+                locking_xid: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+
    /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */
    pub mod rm_neon {
        use bytes::{Buf, Bytes};
@@ -366,6 +432,28 @@ pub mod v16 {
                }
            }
        }
+
+        #[repr(C)]
+        #[derive(Debug)]
+        pub struct XlNeonHeapLock {
+            pub locking_xid: TransactionId,
+            pub t_cid: u32,
+            pub offnum: OffsetNumber,
+            pub infobits_set: u8,
+            pub flags: u8,
+        }
+
+        impl XlNeonHeapLock {
+            pub fn decode(buf: &mut Bytes) -> XlNeonHeapLock {
+                XlNeonHeapLock {
+                    locking_xid: buf.get_u32_le(),
+                    t_cid: buf.get_u32_le(),
+                    offnum: buf.get_u16_le(),
+                    infobits_set: buf.get_u8(),
+                    flags: buf.get_u8(),
+                }
+            }
+        }
    }
 }

--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -2,4 +2,3 @@ comment = '** Deprecated ** Please use pg_embedding instead'
 default_version = '0.1.0'
 module_pathname = '$libdir/hnsw'
 relocatable = true
-trusted = true
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -7,12 +7,12 @@ OBJS = \
 	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
-	libpqwalproposer.o \
 	neon.o \
+	neon_utils.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
-	walproposer_utils.o \
+	walproposer_pg.o \
 	control_plane_connector.o

 PG_CPPFLAGS = -I$(libpq_srcdir)
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -153,7 +153,7 @@ lfc_ensure_opened(void)
 			return false;
 		}
 	}
-	return false;
+	return true;
 }

 static void
@@ -222,8 +222,9 @@ lfc_change_limit_hook(int newval, void *extra)
 	/*
 	 * Stats collector detach shared memory, so we should not try to access shared memory here.
 	 * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
+	 * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
 	 */
-	if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
+	if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
 		return;

 	/* Open cache file if not done yet */
@@ -640,6 +641,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
 	if (rc != BLCKSZ)
 	{
+		LWLockRelease(lfc_lock);
 		lfc_disable("write");
 	}
 	else
@@ -650,9 +652,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);

 		entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
+		LWLockRelease(lfc_lock);
 	}
-
-	LWLockRelease(lfc_lock);
 }

 /*
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -30,7 +30,7 @@

 #include "neon.h"
 #include "walproposer.h"
-#include "walproposer_utils.h"
+#include "neon_utils.h"

 #define PageStoreTrace DEBUG5

--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -1,424 +0,0 @@
-#include "postgres.h"
-
-#include "libpq-fe.h"
-#include "neon.h"
-#include "walproposer.h"
-
-/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
-struct WalProposerConn
-{
-	PGconn	   *pg_conn;
-	bool		is_nonblocking; /* whether the connection is non-blocking */
-	char	   *recvbuf;		/* last received data from
-								 * walprop_async_read */
-};
-
-/* Helper function */
-static bool
-ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
-{
-	/* If we're already correctly blocking or nonblocking, all good */
-	if (is_nonblocking == conn->is_nonblocking)
-		return true;
-
-	/* Otherwise, set it appropriately */
-	if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
-		return false;
-
-	conn->is_nonblocking = is_nonblocking;
-	return true;
-}
-
-/* Exported function definitions */
-char *
-walprop_error_message(WalProposerConn *conn)
-{
-	return PQerrorMessage(conn->pg_conn);
-}
-
-WalProposerConnStatusType
-walprop_status(WalProposerConn *conn)
-{
-	switch (PQstatus(conn->pg_conn))
-	{
-		case CONNECTION_OK:
-			return WP_CONNECTION_OK;
-		case CONNECTION_BAD:
-			return WP_CONNECTION_BAD;
-		default:
-			return WP_CONNECTION_IN_PROGRESS;
-	}
-}
-
-WalProposerConn *
-walprop_connect_start(char *conninfo, char *password)
-{
-	WalProposerConn *conn;
-	PGconn	   *pg_conn;
-	const char *keywords[3];
-	const char *values[3];
-	int			n;
-
-	/*
-	 * Connect using the given connection string. If the
-	 * NEON_AUTH_TOKEN environment variable was set, use that as
-	 * the password.
-	 *
-	 * The connection options are parsed in the order they're given, so
-	 * when we set the password before the connection string, the
-	 * connection string can override the password from the env variable.
-	 * Seems useful, although we don't currently use that capability
-	 * anywhere.
-	 */
-	n = 0;
-	if (password)
-	{
-		keywords[n] = "password";
-		values[n] = password;
-		n++;
-	}
-	keywords[n] = "dbname";
-	values[n] = conninfo;
-	n++;
-	keywords[n] = NULL;
-	values[n] = NULL;
-	n++;
-	pg_conn = PQconnectStartParams(keywords, values, 1);
-
-	/*
-	 * Allocation of a PQconn can fail, and will return NULL. We want to fully
-	 * replicate the behavior of PQconnectStart here.
-	 */
-	if (!pg_conn)
-		return NULL;
-
-	/*
-	 * And in theory this allocation can fail as well, but it's incredibly
-	 * unlikely if we just successfully allocated a PGconn.
-	 *
-	 * palloc will exit on failure though, so there's not much we could do if
-	 * it *did* fail.
-	 */
-	conn = palloc(sizeof(WalProposerConn));
-	conn->pg_conn = pg_conn;
-	conn->is_nonblocking = false;	/* connections always start in blocking
-									 * mode */
-	conn->recvbuf = NULL;
-	return conn;
-}
-
-WalProposerConnectPollStatusType
-walprop_connect_poll(WalProposerConn *conn)
-{
-	WalProposerConnectPollStatusType return_val;
-
-	switch (PQconnectPoll(conn->pg_conn))
-	{
-		case PGRES_POLLING_FAILED:
-			return_val = WP_CONN_POLLING_FAILED;
-			break;
-		case PGRES_POLLING_READING:
-			return_val = WP_CONN_POLLING_READING;
-			break;
-		case PGRES_POLLING_WRITING:
-			return_val = WP_CONN_POLLING_WRITING;
-			break;
-		case PGRES_POLLING_OK:
-			return_val = WP_CONN_POLLING_OK;
-			break;
-
-			/*
-			 * There's a comment at its source about this constant being
-			 * unused. We'll expect it's never returned.
-			 */
-		case PGRES_POLLING_ACTIVE:
-			elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
-
-			/*
-			 * This return is never actually reached, but it's here to make
-			 * the compiler happy
-			 */
-			return WP_CONN_POLLING_FAILED;
-
-		default:
-			Assert(false);
-			return_val = WP_CONN_POLLING_FAILED;	/* keep the compiler quiet */
-	}
-
-	return return_val;
-}
-
-bool
-walprop_send_query(WalProposerConn *conn, char *query)
-{
-	/*
-	 * We need to be in blocking mode for sending the query to run without
-	 * requiring a call to PQflush
-	 */
-	if (!ensure_nonblocking_status(conn, false))
-		return false;
-
-	/* PQsendQuery returns 1 on success, 0 on failure */
-	if (!PQsendQuery(conn->pg_conn, query))
-		return false;
-
-	return true;
-}
-
-WalProposerExecStatusType
-walprop_get_query_result(WalProposerConn *conn)
-{
-	PGresult   *result;
-	WalProposerExecStatusType return_val;
-
-	/* Marker variable if we need to log an unexpected success result */
-	char	   *unexpected_success = NULL;
-
-	/* Consume any input that we might be missing */
-	if (!PQconsumeInput(conn->pg_conn))
-		return WP_EXEC_FAILED;
-
-	if (PQisBusy(conn->pg_conn))
-		return WP_EXEC_NEEDS_INPUT;
-
-
-	result = PQgetResult(conn->pg_conn);
-
-	/*
-	 * PQgetResult returns NULL only if getting the result was successful &
-	 * there's no more of the result to get.
-	 */
-	if (!result)
-	{
-		elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
-		return WP_EXEC_UNEXPECTED_SUCCESS;
-	}
-
-	/* Helper macro to reduce boilerplate */
-#define UNEXPECTED_SUCCESS(msg) \
-		return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
-		unexpected_success = msg; \
-		break;
-
-
-	switch (PQresultStatus(result))
-	{
-			/* "true" success case */
-		case PGRES_COPY_BOTH:
-			return_val = WP_EXEC_SUCCESS_COPYBOTH;
-			break;
-
-			/* Unexpected success case */
-		case PGRES_EMPTY_QUERY:
-			UNEXPECTED_SUCCESS("empty query return");
-		case PGRES_COMMAND_OK:
-			UNEXPECTED_SUCCESS("data-less command end");
-		case PGRES_TUPLES_OK:
-			UNEXPECTED_SUCCESS("tuples return");
-		case PGRES_COPY_OUT:
-			UNEXPECTED_SUCCESS("'Copy Out' response");
-		case PGRES_COPY_IN:
-			UNEXPECTED_SUCCESS("'Copy In' response");
-		case PGRES_SINGLE_TUPLE:
-			UNEXPECTED_SUCCESS("single tuple return");
-		case PGRES_PIPELINE_SYNC:
-			UNEXPECTED_SUCCESS("pipeline sync point");
-
-			/* Failure cases */
-		case PGRES_BAD_RESPONSE:
-		case PGRES_NONFATAL_ERROR:
-		case PGRES_FATAL_ERROR:
-		case PGRES_PIPELINE_ABORTED:
-			return_val = WP_EXEC_FAILED;
-			break;
-
-		default:
-			Assert(false);
-			return_val = WP_EXEC_FAILED;	/* keep the compiler quiet */
-	}
-
-	if (unexpected_success)
-		elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
-
-	return return_val;
-}
-
-pgsocket
-walprop_socket(WalProposerConn *conn)
-{
-	return PQsocket(conn->pg_conn);
-}
-
-int
-walprop_flush(WalProposerConn *conn)
-{
-	return (PQflush(conn->pg_conn));
-}
-
-void
-walprop_finish(WalProposerConn *conn)
-{
-	if (conn->recvbuf != NULL)
-		PQfreemem(conn->recvbuf);
-	PQfinish(conn->pg_conn);
-	pfree(conn);
-}
-
-/*
- * Receive a message from the safekeeper.
- *
- * On success, the data is placed in *buf. It is valid until the next call
- * to this function.
- */
-PGAsyncReadResult
-walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
-{
-	int			result;
-
-	if (conn->recvbuf != NULL)
-	{
-		PQfreemem(conn->recvbuf);
-		conn->recvbuf = NULL;
-	}
-
-	/* Call PQconsumeInput so that we have the data we need */
-	if (!PQconsumeInput(conn->pg_conn))
-	{
-		*amount = 0;
-		*buf = NULL;
-		return PG_ASYNC_READ_FAIL;
-	}
-
-	/*
-	 * The docs for PQgetCopyData list the return values as: 0 if the copy is
-	 * still in progress, but no "complete row" is available -1 if the copy is
-	 * done -2 if an error occurred (> 0) if it was successful; that value is
-	 * the amount transferred.
-	 *
-	 * The protocol we use between walproposer and safekeeper means that we
-	 * *usually* wouldn't expect to see that the copy is done, but this can
-	 * sometimes be triggered by the server returning an ErrorResponse (which
-	 * also happens to have the effect that the copy is done).
-	 */
-	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
-	{
-		case 0:
-			*amount = 0;
-			*buf = NULL;
-			return PG_ASYNC_READ_TRY_AGAIN;
-		case -1:
-			{
-				/*
-				 * If we get -1, it's probably because of a server error; the
-				 * safekeeper won't normally send a CopyDone message.
-				 *
-				 * We can check PQgetResult to make sure that the server
-				 * failed; it'll always result in PGRES_FATAL_ERROR
-				 */
-				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
-
-				if (status != PGRES_FATAL_ERROR)
-					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
-
-				/*
-				 * If there was actually an error, it'll be properly reported
-				 * by calls to PQerrorMessage -- we don't have to do anything
-				 * else
-				 */
-				*amount = 0;
-				*buf = NULL;
-				return PG_ASYNC_READ_FAIL;
-			}
-		case -2:
-			*amount = 0;
-			*buf = NULL;
-			return PG_ASYNC_READ_FAIL;
-		default:
-			/* Positive values indicate the size of the returned result */
-			*amount = result;
-			*buf = conn->recvbuf;
-			return PG_ASYNC_READ_SUCCESS;
-	}
-}
-
-PGAsyncWriteResult
-walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
-{
-	int			result;
-
-	/* If we aren't in non-blocking mode, switch to it. */
-	if (!ensure_nonblocking_status(conn, true))
-		return PG_ASYNC_WRITE_FAIL;
-
-	/*
-	 * The docs for PQputcopyData list the return values as: 1 if the data was
-	 * queued, 0 if it was not queued because of full buffers, or -1 if an
-	 * error occurred
-	 */
-	result = PQputCopyData(conn->pg_conn, buf, size);
-
-	/*
-	 * We won't get a result of zero because walproposer always empties the
-	 * connection's buffers before sending more
-	 */
-	Assert(result != 0);
-
-	switch (result)
-	{
-		case 1:
-			/* good -- continue */
-			break;
-		case -1:
-			return PG_ASYNC_WRITE_FAIL;
-		default:
-			elog(FATAL, "invalid return %d from PQputCopyData", result);
-	}
-
-	/*
-	 * After queueing the data, we still need to flush to get it to send. This
-	 * might take multiple tries, but we don't want to wait around until it's
-	 * done.
-	 *
-	 * PQflush has the following returns (directly quoting the docs): 0 if
-	 * sucessful, 1 if it was unable to send all the data in the send queue
-	 * yet -1 if it failed for some reason
-	 */
-	switch (result = PQflush(conn->pg_conn))
-	{
-		case 0:
-			return PG_ASYNC_WRITE_SUCCESS;
-		case 1:
-			return PG_ASYNC_WRITE_TRY_FLUSH;
-		case -1:
-			return PG_ASYNC_WRITE_FAIL;
-		default:
-			elog(FATAL, "invalid return %d from PQflush", result);
-	}
-}
-
-/*
- * This function is very similar to walprop_async_write. For more
- * information, refer to the comments there.
- */
-bool
-walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
-{
-	int			result;
-
-	/* If we are in non-blocking mode, switch out of it. */
-	if (!ensure_nonblocking_status(conn, false))
-		return false;
-
-	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
-		return false;
-
-	Assert(result == 1);
-
-	/* Because the connection is non-blocking, flushing returns 0 or -1 */
-
-	if ((result = PQflush(conn->pg_conn)) == -1)
-		return false;
-
-	Assert(result == 0);
-	return true;
-}
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -18,6 +18,10 @@ extern char *neon_auth_token;
 extern char *neon_timeline;
 extern char *neon_tenant;

+extern char *wal_acceptors_list;
+extern int	wal_acceptor_reconnect_timeout;
+extern int	wal_acceptor_connection_timeout;
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

@@ -30,4 +34,10 @@ extern void pg_init_extension_server(void);
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
 extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

+extern uint64 BackpressureThrottlingTime(void);
+extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+
+extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
+extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
+
 #endif							/* NEON_H */
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -0,0 +1,116 @@
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "access/xlogutils.h"
+#include "common/logging.h"
+#include "common/ip.h"
+#include "funcapi.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "replication/walsender_private.h"
+
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/ps_status.h"
+
+#include "libpq-fe.h"
+#include <netinet/tcp.h>
+#include <unistd.h>
+
+#if PG_VERSION_NUM >= 150000
+#include "access/xlogutils.h"
+#include "access/xlogrecovery.h"
+#endif
+#if PG_MAJORVERSION_NUM >= 16
+#include "utils/guc.h"
+#endif
+
+/*
+ * Convert a character which represents a hexadecimal digit to an integer.
+ *
+ * Returns -1 if the character is not a hexadecimal digit.
+ */
+int
+HexDecodeChar(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+
+	return -1;
+}
+
+/*
+ * Decode a hex string into a byte string, 2 hex chars per byte.
+ *
+ * Returns false if invalid characters are encountered; otherwise true.
+ */
+bool
+HexDecodeString(uint8 *result, char *input, int nbytes)
+{
+	int			i;
+
+	for (i = 0; i < nbytes; ++i)
+	{
+		int			n1 = HexDecodeChar(input[i * 2]);
+		int			n2 = HexDecodeChar(input[i * 2 + 1]);
+
+		if (n1 < 0 || n2 < 0)
+			return false;
+		result[i] = n1 * 16 + n2;
+	}
+
+	return true;
+}
+
+/* --------------------------------
+ *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32_le(StringInfo msg)
+{
+	uint32		n32;
+
+	pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
+
+	return n32;
+}
+
+/* --------------------------------
+ *		pq_getmsgint64	- get a binary 8-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint64
+pq_getmsgint64_le(StringInfo msg)
+{
+	uint64		n64;
+
+	pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
+
+	return n64;
+}
+
+/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint32_le(StringInfo buf, uint32 i)
+{
+	enlargeStringInfo(buf, sizeof(uint32));
+	memcpy(buf->data + buf->len, &i, sizeof(uint32));
+	buf->len += sizeof(uint32);
+}
+
+/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint64_le(StringInfo buf, uint64 i)
+{
+	enlargeStringInfo(buf, sizeof(uint64));
+	memcpy(buf->data + buf->len, &i, sizeof(uint64));
+	buf->len += sizeof(uint64);
+}
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -0,0 +1,12 @@
+#ifndef __NEON_UTILS_H__
+#define __NEON_UTILS_H__
+
+#include "postgres.h"
+
+bool		HexDecodeString(uint8 *result, char *input, int nbytes);
+uint32		pq_getmsgint32_le(StringInfo msg);
+uint64		pq_getmsgint64_le(StringInfo msg);
+void		pq_sendint32_le(StringInfo buf, uint32 i);
+void		pq_sendint64_le(StringInfo buf, uint64 i);
+
+#endif							/* __NEON_UTILS_H__ */
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1790,6 +1790,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if (!XLogInsertAllowed())
 		return;

+	/* ensure we have enough xlog buffers to log max-sized records */
+	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
+
+	/*
+	 * Iterate over all the pages. They are collected into batches of
+	 * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each
+	 * batch.
+	 */
 	while (remblocks > 0)
 	{
 		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -1,8 +1,8 @@
 #ifndef __NEON_WALPROPOSER_H__
 #define __NEON_WALPROPOSER_H__

-#include "access/xlogdefs.h"
 #include "postgres.h"
+#include "access/xlogdefs.h"
 #include "port.h"
 #include "access/xlog_internal.h"
 #include "access/transam.h"
@@ -16,29 +16,15 @@
 #define MAX_SAFEKEEPERS 32
 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)	/* max size of a single* WAL
 											 * message */
-#define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
-#define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
-								 * message header */
-#define XLOG_HDR_END_POS (1 + 8)	/* offset of end position in wal sender*
-									 * message header */
-
 /*
 * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
 * because all WL_* events are given flags equal to some (1 << i), starting from i = 0
 */
 #define WL_NO_EVENTS 0

-extern char *wal_acceptors_list;
-extern int	wal_acceptor_reconnect_timeout;
-extern int	wal_acceptor_connection_timeout;
-extern bool am_wal_proposer;
-
-struct WalProposerConn;			/* Defined in libpqwalproposer */
+struct WalProposerConn;			/* Defined in implementation (walprop_pg.c) */
 typedef struct WalProposerConn WalProposerConn;

-struct WalMessage;
-typedef struct WalMessage WalMessage;
-
 /* Possible return values from ReadPGAsync */
 typedef enum
 {
@@ -52,7 +38,7 @@ typedef enum
 	PG_ASYNC_READ_TRY_AGAIN,
 	/* Reading failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_READ_FAIL,
-}			PGAsyncReadResult;
+} PGAsyncReadResult;

 /* Possible return values from WritePGAsync */
 typedef enum
@@ -71,7 +57,7 @@ typedef enum
 	PG_ASYNC_WRITE_TRY_FLUSH,
 	/* Writing failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_WRITE_FAIL,
-}			PGAsyncWriteResult;
+} PGAsyncWriteResult;

 /*
 * WAL safekeeper state, which is used to wait for some event.
@@ -147,7 +133,7 @@ typedef enum
 	 * to read.
 	 */
 	SS_ACTIVE,
-}			SafekeeperState;
+} SafekeeperState;

 /* Consensus logical timestamp. */
 typedef uint64 term_t;
@@ -171,12 +157,12 @@ typedef struct ProposerGreeting
 	uint8		tenant_id[16];
 	TimeLineID	timeline;
 	uint32		walSegSize;
-}			ProposerGreeting;
+} ProposerGreeting;

 typedef struct AcceptorProposerMessage
 {
 	uint64		tag;
-}			AcceptorProposerMessage;
+} AcceptorProposerMessage;

 /*
 * Acceptor -> Proposer initial response: the highest term acceptor voted for.
@@ -186,7 +172,7 @@ typedef struct AcceptorGreeting
 	AcceptorProposerMessage apm;
 	term_t		term;
 	NNodeId		nodeId;
-}			AcceptorGreeting;
+} AcceptorGreeting;

 /*
 * Proposer -> Acceptor vote request.
@@ -196,20 +182,20 @@ typedef struct VoteRequest
 	uint64		tag;
 	term_t		term;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-}			VoteRequest;
+} VoteRequest;

 /* Element of term switching chain. */
 typedef struct TermSwitchEntry
 {
 	term_t		term;
 	XLogRecPtr	lsn;
-}			TermSwitchEntry;
+} TermSwitchEntry;

 typedef struct TermHistory
 {
 	uint32		n_entries;
 	TermSwitchEntry *entries;
-}			TermHistory;
+} TermHistory;

 /* Vote itself, sent from safekeeper to proposer */
 typedef struct VoteResponse
@@ -227,7 +213,7 @@ typedef struct VoteResponse
 								 * recovery of some safekeeper */
 	TermHistory termHistory;
 	XLogRecPtr	timelineStartLsn;	/* timeline globally starts at this LSN */
-}			VoteResponse;
+} VoteResponse;

 /*
 * Proposer -> Acceptor message announcing proposer is elected and communicating
@@ -243,7 +229,7 @@ typedef struct ProposerElected
 	TermHistory *termHistory;
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
-}			ProposerElected;
+} ProposerElected;

 /*
 * Header of request with WAL message sent from proposer to safekeeper.
@@ -268,7 +254,7 @@ typedef struct AppendRequestHeader
 	 */
 	XLogRecPtr	truncateLsn;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-}			AppendRequestHeader;
+} AppendRequestHeader;

 /*
 * Hot standby feedback received from replica
@@ -278,7 +264,7 @@ typedef struct HotStandbyFeedback
 	TimestampTz ts;
 	FullTransactionId xmin;
 	FullTransactionId catalog_xmin;
-}			HotStandbyFeedback;
+} HotStandbyFeedback;

 typedef struct PageserverFeedback
 {
@@ -289,7 +275,7 @@ typedef struct PageserverFeedback
 	XLogRecPtr	disk_consistent_lsn;
 	XLogRecPtr	remote_consistent_lsn;
 	TimestampTz replytime;
-}			PageserverFeedback;
+} PageserverFeedback;

 typedef struct WalproposerShmemState
 {
@@ -297,7 +283,7 @@ typedef struct WalproposerShmemState
 	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
-}			WalproposerShmemState;
+} WalproposerShmemState;

 /*
 * Report safekeeper state to proposer
@@ -321,17 +307,22 @@ typedef struct AppendResponse
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
 	PageserverFeedback rf;
-}			AppendResponse;
+} AppendResponse;

 /*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
 #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)

+struct WalProposer;
+typedef struct WalProposer WalProposer;
+
 /*
 * Descriptor of safekeeper
 */
 typedef struct Safekeeper
 {
+	WalProposer *wp;
+
 	char const *host;
 	char const *port;

@@ -340,7 +331,7 @@ typedef struct Safekeeper
 	 *
 	 * May contain private information like password and should not be logged.
 	 */
-	char conninfo[MAXCONNINFO];
+	char		conninfo[MAXCONNINFO];

 	/*
 	 * postgres protocol connection to the WAL acceptor
@@ -373,27 +364,12 @@ typedef struct Safekeeper
 	int			eventPos;		/* position in wait event set. Equal to -1 if*
 								 * no event */
 	SafekeeperState state;		/* safekeeper state machine state */
-	TimestampTz latestMsgReceivedAt;        /* when latest msg is received */
+	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
 	AppendResponse appendResponse;	/* feedback for master */
 } Safekeeper;

-extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
-extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
-extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
-extern void WalProposerPoll(void);
-extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
-											PageserverFeedback *rf);
-extern void StartProposerReplication(StartReplicationCmd *cmd);
-
-extern Size WalproposerShmemSize(void);
-extern bool WalproposerShmemInit(void);
-extern void replication_feedback_set(PageserverFeedback *rf);
-extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
-
-/* libpqwalproposer hooks & helper type */
-
 /* Re-exported PostgresPollingStatusType */
 typedef enum
 {
@@ -406,7 +382,7 @@ typedef enum
 	 * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
 	 * We've removed it here to avoid clutter.
 	 */
-}			WalProposerConnectPollStatusType;
+} WalProposerConnectPollStatusType;

 /* Re-exported and modified ExecStatusType */
 typedef enum
@@ -431,7 +407,7 @@ typedef enum
 	WP_EXEC_NEEDS_INPUT,
 	/* Catch-all failure. Check PQerrorMessage. */
 	WP_EXEC_FAILED,
-}			WalProposerExecStatusType;
+} WalProposerExecStatusType;

 /* Re-exported ConnStatusType */
 typedef enum
@@ -445,67 +421,252 @@ typedef enum
 	 * that extra functionality, so we collect them into a single tag here.
 	 */
 	WP_CONNECTION_IN_PROGRESS,
-}			WalProposerConnStatusType;
-
-/* Re-exported PQerrorMessage */
-extern char *walprop_error_message(WalProposerConn *conn);
-
-/* Re-exported PQstatus */
-extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
-
-/* Re-exported PQconnectStart */
-extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
-
-/* Re-exported PQconectPoll */
-extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
-
-/* Blocking wrapper around PQsendQuery */
-extern bool walprop_send_query(WalProposerConn *conn, char *query);
-
-/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
-extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
-
-/* Re-exported PQsocket */
-extern pgsocket walprop_socket(WalProposerConn *conn);
-
-/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
-extern int	walprop_flush(WalProposerConn *conn);
-
-/* Re-exported PQfinish */
-extern void walprop_finish(WalProposerConn *conn);
+} WalProposerConnStatusType;

 /*
- * Ergonomic wrapper around PGgetCopyData
- *
- * Reads a CopyData block from a safekeeper, setting *amount to the number
- * of bytes returned.
- *
- * This function is allowed to assume certain properties specific to the
- * protocol with the safekeepers, so it should not be used as-is for any
- * other purpose.
- *
- * Note: If possible, using <AsyncRead> is generally preferred, because it
- * performs a bit of extra checking work that's always required and is normally
- * somewhat verbose.
+ * Collection of hooks for walproposer, to call postgres functions,
+ * read WAL and send it over the network.
 */
-extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);
+typedef struct walproposer_api
+{
+	/*
+	 * Get WalproposerShmemState. This is used to store information about last
+	 * elected term.
+	 */
+	WalproposerShmemState *(*get_shmem_state) (void);
+
+	/*
+	 * Start receiving notifications about new WAL. This is an infinite loop
+	 * which calls WalProposerBroadcast() and WalProposerPoll() to send the
+	 * WAL.
+	 */
+	void		(*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
+
+	/* Get pointer to the latest available WAL. */
+	XLogRecPtr	(*get_flush_rec_ptr) (void);
+
+	/* Get current time. */
+	TimestampTz (*get_current_timestamp) (void);
+
+	/* Get postgres timeline. */
+	TimeLineID	(*get_timeline_id) (void);
+
+	/* Current error message, aka PQerrorMessage. */
+	char	   *(*conn_error_message) (WalProposerConn *conn);
+
+	/* Connection status, aka PQstatus. */
+	WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
+
+	/* Start the connection, aka PQconnectStart. */
+	WalProposerConn *(*conn_connect_start) (char *conninfo);
+
+	/* Poll an asynchronous connection, aka PQconnectPoll. */
+	WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
+
+	/* Send a blocking SQL query, aka PQsendQuery. */
+	bool		(*conn_send_query) (WalProposerConn *conn, char *query);
+
+	/* Read the query result, aka PQgetResult. */
+	WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
+
+	/* Flush buffer to the network, aka PQflush. */
+	int			(*conn_flush) (WalProposerConn *conn);
+
+	/* Close the connection, aka PQfinish. */
+	void		(*conn_finish) (WalProposerConn *conn);
+
+	/* Try to read CopyData message, aka PQgetCopyData. */
+	PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
+
+	/* Try to write CopyData message, aka PQputCopyData. */
+	PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
+
+	/* Blocking CopyData write, aka PQputCopyData + PQflush. */
+	bool		(*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
+
+	/* Download WAL from startpos to endpos and make it available locally. */
+	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+
+	/* Read WAL from disk to buf. */
+	void		(*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
+
+	/* Allocate WAL reader. */
+	XLogReaderState *(*wal_reader_allocate) (void);
+
+	/* Deallocate event set. */
+	void		(*free_event_set) (void);
+
+	/* Initialize event set. */
+	void		(*init_event_set) (int n_safekeepers);
+
+	/* Update events for an existing safekeeper connection. */
+	void		(*update_event_set) (Safekeeper *sk, uint32 events);
+
+	/* Add a new safekeeper connection to the event set. */
+	void		(*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
+
+	/*
+	 * Wait until some event happens: - timeout is reached - socket event for
+	 * safekeeper connection - new WAL is available
+	 *
+	 * Returns 0 if timeout is reached, 1 if some event happened. Updates
+	 * events mask to indicate events and sets sk to the safekeeper which has
+	 * an event.
+	 */
+	int			(*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
+
+	/* Read random bytes. */
+	bool		(*strong_random) (void *buf, size_t len);
+
+	/*
+	 * Get a basebackup LSN. Used to cross-validate with the latest available
+	 * LSN on the safekeepers.
+	 */
+	XLogRecPtr	(*get_redo_start_lsn) (void);
+
+	/*
+	 * Finish sync safekeepers with the given LSN. This function should not
+	 * return and should exit the program.
+	 */
+	void		(*finish_sync_safekeepers) (XLogRecPtr lsn);
+
+	/*
+	 * Called after every new message from the safekeeper. Used to propagate
+	 * backpressure feedback and to confirm WAL persistence (has been commited
+	 * on the quorum of safekeepers).
+	 */
+	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
+
+	/*
+	 * Called on peer_horizon_lsn updates. Used to advance replication slot
+	 * and to free up disk space by deleting unnecessary WAL.
+	 */
+	void		(*confirm_wal_streamed) (XLogRecPtr lsn);
+} walproposer_api;

 /*
- * Ergonomic wrapper around PQputCopyData + PQflush
- *
- * Starts to write a CopyData block to a safekeeper.
- *
- * For information on the meaning of return codes, refer to PGAsyncWriteResult.
+ * Configuration of the WAL proposer.
 */
-extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);
+typedef struct WalProposerConfig
+{
+	/* hex-encoded TenantId cstr */
+	char	   *neon_tenant;
+
+	/* hex-encoded TimelineId cstr */
+	char	   *neon_timeline;
+
+	/*
+	 * Comma-separated list of safekeepers, in the following format:
+	 * host1:port1,host2:port2,host3:port3
+	 *
+	 * This cstr should be editable.
+	 */
+	char	   *safekeepers_list;
+
+	/*
+	 * WalProposer reconnects to offline safekeepers once in this interval.
+	 * Time is in milliseconds.
+	 */
+	int			safekeeper_reconnect_timeout;
+
+	/*
+	 * WalProposer terminates the connection if it doesn't receive any message
+	 * from the safekeeper in this interval. Time is in milliseconds.
+	 */
+	int			safekeeper_connection_timeout;
+
+	/*
+	 * WAL segment size. Will be passed to safekeepers in greet request. Also
+	 * used to detect page headers.
+	 */
+	int			wal_segment_size;
+
+	/*
+	 * If safekeeper was started in sync mode, walproposer will not subscribe
+	 * for new WAL and will exit when quorum of safekeepers will be synced to
+	 * the latest available LSN.
+	 */
+	bool		syncSafekeepers;
+
+	/* Will be passed to safekeepers in greet request. */
+	uint64		systemId;
+} WalProposerConfig;
+

 /*
- * Blocking equivalent to walprop_async_write_fn
- *
- * Returns 'true' if successful, 'false' on failure.
+ * WAL proposer state.
 */
-extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);
+typedef struct WalProposer
+{
+	WalProposerConfig *config;
+	int			n_safekeepers;

-extern uint64 BackpressureThrottlingTime(void);
+	/* (n_safekeepers / 2) + 1 */
+	int			quorum;
+
+	Safekeeper	safekeeper[MAX_SAFEKEEPERS];
+
+	/* WAL has been generated up to this point */
+	XLogRecPtr	availableLsn;
+
+	/* last commitLsn broadcasted to safekeepers */
+	XLogRecPtr	lastSentCommitLsn;
+
+	ProposerGreeting greetRequest;
+
+	/* Vote request for safekeeper */
+	VoteRequest voteRequest;
+
+	/*
+	 * Minimal LSN which may be needed for recovery of some safekeeper,
+	 * record-aligned (first record which might not yet received by someone).
+	 */
+	XLogRecPtr	truncateLsn;
+
+	/*
+	 * Term of the proposer. We want our term to be highest and unique, so we
+	 * collect terms from safekeepers quorum, choose max and +1. After that
+	 * our term is fixed and must not change. If we observe that some
+	 * safekeeper has higher term, it means that we have another running
+	 * compute, so we must stop immediately.
+	 */
+	term_t		propTerm;
+
+	/* term history of the proposer */
+	TermHistory propTermHistory;
+
+	/* epoch start lsn of the proposer */
+	XLogRecPtr	propEpochStartLsn;
+
+	/* Most advanced acceptor epoch */
+	term_t		donorEpoch;
+
+	/* Most advanced acceptor */
+	int			donor;
+
+	/* timeline globally starts at this LSN */
+	XLogRecPtr	timelineStartLsn;
+
+	/* number of votes collected from safekeepers */
+	int			n_votes;
+
+	/* number of successful connections over the lifetime of walproposer */
+	int			n_connected;
+
+	/*
+	 * Timestamp of the last reconnection attempt. Related to
+	 * config->safekeeper_reconnect_timeout
+	 */
+	TimestampTz last_reconnect_attempt;
+
+	walproposer_api api;
+} WalProposer;
+
+extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api api);
+extern void WalProposerStart(WalProposer *wp);
+extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
+extern void WalProposerPoll(WalProposer *wp);
+extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
+										   PageserverFeedback *rf);

 #endif							/* __NEON_WALPROPOSER_H__ */
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -1,659 +0,0 @@
-#include "postgres.h"
-
-#include "access/timeline.h"
-#include "access/xlogutils.h"
-#include "common/logging.h"
-#include "common/ip.h"
-#include "funcapi.h"
-#include "libpq/libpq.h"
-#include "libpq/pqformat.h"
-#include "miscadmin.h"
-#include "postmaster/interrupt.h"
-#include "replication/slot.h"
-#include "walproposer_utils.h"
-#include "replication/walsender_private.h"
-
-#include "storage/ipc.h"
-#include "utils/builtins.h"
-#include "utils/ps_status.h"
-
-#include "libpq-fe.h"
-#include <netinet/tcp.h>
-#include <unistd.h>
-
-#if PG_VERSION_NUM >= 150000
-#include "access/xlogutils.h"
-#include "access/xlogrecovery.h"
-#endif
-#if PG_MAJORVERSION_NUM >= 16
-#include "utils/guc.h"
-#endif
-
-/*
- * These variables are used similarly to openLogFile/SegNo,
- * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
- * corresponding the filename of walpropFile.
- */
-static int	walpropFile = -1;
-static TimeLineID walpropFileTLI = 0;
-static XLogSegNo walpropSegNo = 0;
-
-/* START cloned file-local variables and functions from walsender.c */
-
-/*
- * How far have we sent WAL already? This is also advertised in
- * MyWalSnd->sentPtr.  (Actually, this is the next WAL location to send.)
- */
-static XLogRecPtr sentPtr = InvalidXLogRecPtr;
-
-static void WalSndLoop(void);
-static void XLogBroadcastWalProposer(void);
-/* END cloned file-level variables and functions from walsender.c */
-
-int
-CompareLsn(const void *a, const void *b)
-{
-	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
-	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
-
-	if (lsn1 < lsn2)
-		return -1;
-	else if (lsn1 == lsn2)
-		return 0;
-	else
-		return 1;
-}
-
-/* Returns a human-readable string corresonding to the SafekeeperState
- *
- * The string should not be freed.
- *
- * The strings are intended to be used as a prefix to "state", e.g.:
- *
- *   elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
- *
- * If this sort of phrasing doesn't fit the message, instead use something like:
- *
- *   elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
- */
-char *
-FormatSafekeeperState(SafekeeperState state)
-{
-	char	   *return_val = NULL;
-
-	switch (state)
-	{
-		case SS_OFFLINE:
-			return_val = "offline";
-			break;
-		case SS_CONNECTING_READ:
-		case SS_CONNECTING_WRITE:
-			return_val = "connecting";
-			break;
-		case SS_WAIT_EXEC_RESULT:
-			return_val = "receiving query result";
-			break;
-		case SS_HANDSHAKE_RECV:
-			return_val = "handshake (receiving)";
-			break;
-		case SS_VOTING:
-			return_val = "voting";
-			break;
-		case SS_WAIT_VERDICT:
-			return_val = "wait-for-verdict";
-			break;
-		case SS_SEND_ELECTED_FLUSH:
-			return_val = "send-announcement-flush";
-			break;
-		case SS_IDLE:
-			return_val = "idle";
-			break;
-		case SS_ACTIVE:
-			return_val = "active";
-			break;
-	}
-
-	Assert(return_val != NULL);
-
-	return return_val;
-}
-
-/* Asserts that the provided events are expected for given safekeeper's state */
-void
-AssertEventsOkForState(uint32 events, Safekeeper *sk)
-{
-	uint32		expected = SafekeeperStateDesiredEvents(sk->state);
-
-	/*
-	 * The events are in-line with what we're expecting, under two conditions:
-	 * (a) if we aren't expecting anything, `events` has no read- or
-	 * write-ready component. (b) if we are expecting something, there's
-	 * overlap (i.e. `events & expected != 0`)
-	 */
-	bool		events_ok_for_state;	/* long name so the `Assert` is more
-										 * clear later */
-
-	if (expected == WL_NO_EVENTS)
-		events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0);
-	else
-		events_ok_for_state = ((events & expected) != 0);
-
-	if (!events_ok_for_state)
-	{
-		/*
-		 * To give a descriptive message in the case of failure, we use elog
-		 * and then an assertion that's guaranteed to fail.
-		 */
-		elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-			 FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
-		Assert(events_ok_for_state);
-	}
-}
-
-/* Returns the set of events a safekeeper in this state should be waiting on
- *
- * This will return WL_NO_EVENTS (= 0) for some events. */
-uint32
-SafekeeperStateDesiredEvents(SafekeeperState state)
-{
-	uint32		result = WL_NO_EVENTS;
-
-	/* If the state doesn't have a modifier, we can check the base state */
-	switch (state)
-	{
-			/* Connecting states say what they want in the name */
-		case SS_CONNECTING_READ:
-			result = WL_SOCKET_READABLE;
-			break;
-		case SS_CONNECTING_WRITE:
-			result = WL_SOCKET_WRITEABLE;
-			break;
-
-			/* Reading states need the socket to be read-ready to continue */
-		case SS_WAIT_EXEC_RESULT:
-		case SS_HANDSHAKE_RECV:
-		case SS_WAIT_VERDICT:
-			result = WL_SOCKET_READABLE;
-			break;
-
-			/*
-			 * Idle states use read-readiness as a sign that the connection
-			 * has been disconnected.
-			 */
-		case SS_VOTING:
-		case SS_IDLE:
-			result = WL_SOCKET_READABLE;
-			break;
-
-			/*
-			 * Flush states require write-ready for flushing. Active state
-			 * does both reading and writing.
-			 *
-			 * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
-			 * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
-			 */
-		case SS_SEND_ELECTED_FLUSH:
-		case SS_ACTIVE:
-			result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
-			break;
-
-			/* The offline state expects no events. */
-		case SS_OFFLINE:
-			result = WL_NO_EVENTS;
-			break;
-
-		default:
-			Assert(false);
-			break;
-	}
-
-	return result;
-}
-
-/* Returns a human-readable string corresponding to the event set
- *
- * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the
- * returned string may be meaingless.
- *
- * The string should not be freed. It should also not be expected to remain the same between
- * function calls. */
-char *
-FormatEvents(uint32 events)
-{
-	static char return_str[8];
-
-	/* Helper variable to check if there's extra bits */
-	uint32		all_flags = WL_LATCH_SET
-	| WL_SOCKET_READABLE
-	| WL_SOCKET_WRITEABLE
-	| WL_TIMEOUT
-	| WL_POSTMASTER_DEATH
-	| WL_EXIT_ON_PM_DEATH
-	| WL_SOCKET_CONNECTED;
-
-	/*
-	 * The formatting here isn't supposed to be *particularly* useful -- it's
-	 * just to give an sense of what events have been triggered without
-	 * needing to remember your powers of two.
-	 */
-
-	return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_';
-	return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_';
-	return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
-	return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_';
-	return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
-	return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
-	return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
-
-	if (events & (~all_flags))
-	{
-		elog(WARNING, "Event formatting found unexpected component %d",
-			 events & (~all_flags));
-		return_str[6] = '*';
-		return_str[7] = '\0';
-	}
-	else
-		return_str[6] = '\0';
-
-	return (char *) &return_str;
-}
-
-/*
- * Convert a character which represents a hexadecimal digit to an integer.
- *
- * Returns -1 if the character is not a hexadecimal digit.
- */
-static int
-HexDecodeChar(char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 10;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 10;
-
-	return -1;
-}
-
-/*
- * Decode a hex string into a byte string, 2 hex chars per byte.
- *
- * Returns false if invalid characters are encountered; otherwise true.
- */
-bool
-HexDecodeString(uint8 *result, char *input, int nbytes)
-{
-	int			i;
-
-	for (i = 0; i < nbytes; ++i)
-	{
-		int			n1 = HexDecodeChar(input[i * 2]);
-		int			n2 = HexDecodeChar(input[i * 2 + 1]);
-
-		if (n1 < 0 || n2 < 0)
-			return false;
-		result[i] = n1 * 16 + n2;
-	}
-
-	return true;
-}
-
-/* --------------------------------
- *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer in native (LE) order
- * --------------------------------
- */
-uint32
-pq_getmsgint32_le(StringInfo msg)
-{
-	uint32		n32;
-
-	pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
-
-	return n32;
-}
-
-/* --------------------------------
- *		pq_getmsgint64	- get a binary 8-byte int from a message buffer in native (LE) order
- * --------------------------------
- */
-uint64
-pq_getmsgint64_le(StringInfo msg)
-{
-	uint64		n64;
-
-	pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
-
-	return n64;
-}
-
-/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
-void
-pq_sendint32_le(StringInfo buf, uint32 i)
-{
-	enlargeStringInfo(buf, sizeof(uint32));
-	memcpy(buf->data + buf->len, &i, sizeof(uint32));
-	buf->len += sizeof(uint32);
-}
-
-/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
-void
-pq_sendint64_le(StringInfo buf, uint64 i)
-{
-	enlargeStringInfo(buf, sizeof(uint64));
-	memcpy(buf->data + buf->len, &i, sizeof(uint64));
-	buf->len += sizeof(uint64);
-}
-
-/*
- * Write XLOG data to disk.
- */
-void
-XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
-{
-	int			startoff;
-	int			byteswritten;
-
-	while (nbytes > 0)
-	{
-		int			segbytes;
-
-		/* Close the current segment if it's completed */
-		if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
-			XLogWalPropClose(recptr);
-
-		if (walpropFile < 0)
-		{
-#if PG_VERSION_NUM >= 150000
-			/* FIXME Is it ok to use hardcoded value here? */
-			TimeLineID	tli = 1;
-#else
-			bool		use_existent = true;
-#endif
-			/* Create/use new log file */
-			XLByteToSeg(recptr, walpropSegNo, wal_segment_size);
-#if PG_VERSION_NUM >= 150000
-			walpropFile = XLogFileInit(walpropSegNo, tli);
-			walpropFileTLI = tli;
-#else
-			walpropFile = XLogFileInit(walpropSegNo, &use_existent, false);
-			walpropFileTLI = ThisTimeLineID;
-#endif
-		}
-
-		/* Calculate the start offset of the received logs */
-		startoff = XLogSegmentOffset(recptr, wal_segment_size);
-
-		if (startoff + nbytes > wal_segment_size)
-			segbytes = wal_segment_size - startoff;
-		else
-			segbytes = nbytes;
-
-		/* OK to write the logs */
-		errno = 0;
-
-		byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff);
-		if (byteswritten <= 0)
-		{
-			char		xlogfname[MAXFNAMELEN];
-			int			save_errno;
-
-			/* if write didn't set errno, assume no disk space */
-			if (errno == 0)
-				errno = ENOSPC;
-
-			save_errno = errno;
-			XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
-			errno = save_errno;
-			ereport(PANIC,
-					(errcode_for_file_access(),
-					 errmsg("could not write to log segment %s "
-							"at offset %u, length %lu: %m",
-							xlogfname, startoff, (unsigned long) segbytes)));
-		}
-
-		/* Update state for write */
-		recptr += byteswritten;
-
-		nbytes -= byteswritten;
-		buf += byteswritten;
-	}
-
-	/*
-	 * Close the current segment if it's fully written up in the last cycle of
-	 * the loop.
-	 */
-	if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
-	{
-		XLogWalPropClose(recptr);
-	}
-}
-
-/*
- * Close the current segment.
- */
-void
-XLogWalPropClose(XLogRecPtr recptr)
-{
-	Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
-
-	if (close(walpropFile) != 0)
-	{
-		char		xlogfname[MAXFNAMELEN];
-
-		XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
-
-		ereport(PANIC,
-				(errcode_for_file_access(),
-				 errmsg("could not close log segment %s: %m",
-						xlogfname)));
-	}
-
-	walpropFile = -1;
-}
-
-/* START of cloned functions from walsender.c */
-
-/*
- * Subscribe for new WAL and stream it in the loop to safekeepers.
- *
- * At the moment, this never returns, but an ereport(ERROR) will take us back
- * to the main loop.
- */
-void
-StartProposerReplication(StartReplicationCmd *cmd)
-{
-	XLogRecPtr	FlushPtr;
-	TimeLineID	currTLI;
-
-#if PG_VERSION_NUM < 150000
-	if (ThisTimeLineID == 0)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
-#endif
-
-	/*
-	 * We assume here that we're logging enough information in the WAL for
-	 * log-shipping, since this is checked in PostmasterMain().
-	 *
-	 * NOTE: wal_level can only change at shutdown, so in most cases it is
-	 * difficult for there to be WAL data that we can still see that was
-	 * written at wal_level='minimal'.
-	 */
-
-	if (cmd->slotname)
-	{
-		ReplicationSlotAcquire(cmd->slotname, true);
-		if (SlotIsLogical(MyReplicationSlot))
-			ereport(ERROR,
-					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-					 errmsg("cannot use a logical replication slot for physical replication")));
-
-		/*
-		 * We don't need to verify the slot's restart_lsn here; instead we
-		 * rely on the caller requesting the starting point to use.  If the
-		 * WAL segment doesn't exist, we'll fail later.
-		 */
-	}
-
-	/*
-	 * Select the timeline. If it was given explicitly by the client, use
-	 * that. Otherwise use the timeline of the last replayed record, which is
-	 * kept in ThisTimeLineID.
-	 *
-	 * Neon doesn't currently use PG Timelines, but it may in the future, so
-	 * we keep this code around to lighten the load for when we need it.
-	 */
-#if PG_VERSION_NUM >= 150000
-	FlushPtr = GetFlushRecPtr(&currTLI);
-#else
-	FlushPtr = GetFlushRecPtr();
-	currTLI = ThisTimeLineID;
-#endif
-
-	/*
-	 * When we first start replication the standby will be behind the
-	 * primary. For some applications, for example synchronous
-	 * replication, it is important to have a clear state for this initial
-	 * catchup mode, so we can trigger actions when we change streaming
-	 * state later. We may stay in this state for a long time, which is
-	 * exactly why we want to be able to monitor whether or not we are
-	 * still here.
-	 */
-	WalSndSetState(WALSNDSTATE_CATCHUP);
-
-	/*
-	 * Don't allow a request to stream from a future point in WAL that
-	 * hasn't been flushed to disk in this server yet.
-	 */
-	if (FlushPtr < cmd->startpoint)
-	{
-		ereport(ERROR,
-				(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
-						LSN_FORMAT_ARGS(cmd->startpoint),
-						LSN_FORMAT_ARGS(FlushPtr))));
-	}
-
-	/* Start streaming from the requested point */
-	sentPtr = cmd->startpoint;
-
-	/* Initialize shared memory status, too */
-	SpinLockAcquire(&MyWalSnd->mutex);
-	MyWalSnd->sentPtr = sentPtr;
-	SpinLockRelease(&MyWalSnd->mutex);
-
-	SyncRepInitConfig();
-
-	/* Infinite send loop, never returns */
-	WalSndLoop();
-
-	WalSndSetState(WALSNDSTATE_STARTUP);
-
-	if (cmd->slotname)
-		ReplicationSlotRelease();
-}
-
-/*
- * Main loop that waits for LSN updates and calls the walproposer.
- * Synchronous replication sets latch in WalSndWakeup at walsender.c
- */
-static void
-WalSndLoop(void)
-{
-	/* Clear any already-pending wakeups */
-	ResetLatch(MyLatch);
-
-	for (;;)
-	{
-		CHECK_FOR_INTERRUPTS();
-
-		XLogBroadcastWalProposer();
-
-		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-			WalSndSetState(WALSNDSTATE_STREAMING);
-		WalProposerPoll();
-	}
-}
-
-/*
- * Notify walproposer about the new WAL position.
- */
-static void
-XLogBroadcastWalProposer(void)
-{
-	XLogRecPtr	startptr;
-	XLogRecPtr	endptr;
-
-	/* Start from the last sent position */
-	startptr = sentPtr;
-
-	/*
-	 * Streaming the current timeline on a primary.
-	 *
-	 * Attempt to send all data that's already been written out and
-	 * fsync'd to disk.  We cannot go further than what's been written out
-	 * given the current implementation of WALRead().  And in any case
-	 * it's unsafe to send WAL that is not securely down to disk on the
-	 * primary: if the primary subsequently crashes and restarts, standbys
-	 * must not have applied any WAL that got lost on the primary.
-	 */
-#if PG_VERSION_NUM >= 150000
-	endptr = GetFlushRecPtr(NULL);
-#else
-	endptr = GetFlushRecPtr();
-#endif
-
-	/*
-	 * Record the current system time as an approximation of the time at which
-	 * this WAL location was written for the purposes of lag tracking.
-	 *
-	 * In theory we could make XLogFlush() record a time in shmem whenever WAL
-	 * is flushed and we could get that time as well as the LSN when we call
-	 * GetFlushRecPtr() above (and likewise for the cascading standby
-	 * equivalent), but rather than putting any new code into the hot WAL path
-	 * it seems good enough to capture the time here.  We should reach this
-	 * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that
-	 * may take some time, we read the WAL flush pointer and take the time
-	 * very close to together here so that we'll get a later position if it is
-	 * still moving.
-	 *
-	 * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
-	 * this gives us a cheap approximation for the WAL flush time for this
-	 * LSN.
-	 *
-	 * Note that the LSN is not necessarily the LSN for the data contained in
-	 * the present message; it's the end of the WAL, which might be further
-	 * ahead.  All the lag tracking machinery cares about is finding out when
-	 * that arbitrary LSN is eventually reported as written, flushed and
-	 * applied, so that it can measure the elapsed time.
-	 */
-	LagTrackerWrite(endptr, GetCurrentTimestamp());
-
-	/* Do we have any work to do? */
-	Assert(startptr <= endptr);
-	if (endptr <= startptr)
-		return;
-
-	WalProposerBroadcast(startptr, endptr);
-	sentPtr = endptr;
-
-	/* Update shared memory status */
-	{
-		WalSnd	   *walsnd = MyWalSnd;
-
-		SpinLockAcquire(&walsnd->mutex);
-		walsnd->sentPtr = sentPtr;
-		SpinLockRelease(&walsnd->mutex);
-	}
-
-	/* Report progress of XLOG streaming in PS display */
-	if (update_process_title)
-	{
-		char		activitymsg[50];
-
-		snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
-				 LSN_FORMAT_ARGS(sentPtr));
-		set_ps_display(activitymsg);
-	}
-}
--- a/pgxn/neon/walproposer_utils.h
+++ b/pgxn/neon/walproposer_utils.h
@@ -1,19 +0,0 @@
-#ifndef __NEON_WALPROPOSER_UTILS_H__
-#define __NEON_WALPROPOSER_UTILS_H__
-
-#include "walproposer.h"
-
-int			CompareLsn(const void *a, const void *b);
-char	   *FormatSafekeeperState(SafekeeperState state);
-void		AssertEventsOkForState(uint32 events, Safekeeper *sk);
-uint32		SafekeeperStateDesiredEvents(SafekeeperState state);
-char	   *FormatEvents(uint32 events);
-bool		HexDecodeString(uint8 *result, char *input, int nbytes);
-uint32		pq_getmsgint32_le(StringInfo msg);
-uint64		pq_getmsgint64_le(StringInfo msg);
-void		pq_sendint32_le(StringInfo buf, uint32 i);
-void		pq_sendint64_le(StringInfo buf, uint64 i);
-void		XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
-void		XLogWalPropClose(XLogRecPtr recptr);
-
-#endif							/* __NEON_WALPROPOSER_UTILS_H__ */
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -887,34 +887,34 @@ files = [

 [[package]]
 name = "cryptography"
-version = "41.0.3"
+version = "41.0.4"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
-    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
-    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
-    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
-    {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
-    {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
-    {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
+    {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"},
+    {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"},
+    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"},
+    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"},
+    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"},
+    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"},
+    {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"},
+    {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"},
+    {file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"},
+    {file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"},
+    {file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"},
+    {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"},
+    {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"},
+    {file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"},
+    {file = "cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"},
+    {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"},
+    {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"},
+    {file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"},
+    {file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"},
+    {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"},
+    {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"},
+    {file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"},
+    {file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"},
 ]

 [package.dependencies]
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -121,7 +121,7 @@ async fn collect_metrics_iteration(

    let current_metrics = gather_proxy_io_bytes_per_client();

-    let metrics_to_send: Vec<Event<Ids>> = current_metrics
+    let metrics_to_send: Vec<Event<Ids, &'static str>> = current_metrics
        .iter()
        .filter_map(|(curr_key, (curr_val, curr_time))| {
            let mut start_time = *curr_time;
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -28,10 +28,11 @@ use tracing::{error, info, info_span, warn, Instrument};
 use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
-/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
-pub const NUM_RETRIES_CONNECT: u32 = 10;
+/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
+pub const NUM_RETRIES_CONNECT: u32 = 16;
 const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
-const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
+const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
+const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -553,8 +554,7 @@ impl ShouldRetry for compute::ConnectionError {
 }

 pub fn retry_after(num_retries: u32) -> time::Duration {
-    // 1.5 seems to be an ok growth factor heuristic
-    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
+    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -303,7 +303,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 1..10 {
+    for num_retries in 1..NUM_RETRIES_CONNECT {
        total_wait += retry_after(num_retries);
    }
    assert!(total_wait < tokio::time::Duration::from_secs(12));
@@ -494,11 +494,11 @@ async fn connect_to_compute_non_retry_2() {
 /// Retry for at most `NUM_RETRIES_CONNECT` times.
 #[tokio::test]
 async fn connect_to_compute_non_retry_3() {
-    assert_eq!(NUM_RETRIES_CONNECT, 10);
+    assert_eq!(NUM_RETRIES_CONNECT, 16);
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![
-        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
-        /* the 11th time */ Retry,
+        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
+        Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
    ]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
    connect_to_compute(&mechanism, cache, &extra, &creds)
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.72.0"
+channel = "1.72.1"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/scripts/download_basebackup.py
+++ b/scripts/download_basebackup.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+#
+# Script to download the basebackup from a pageserver to a tar file.
+#
+# This can be useful in disaster recovery.
+#
+import argparse
+
+import psycopg2
+from psycopg2.extensions import connection as PgConnection
+
+
+def main(args: argparse.Namespace):
+    pageserver_connstr = args.pageserver_connstr
+    tenant_id = args.tenant
+    timeline_id = args.timeline
+    lsn = args.lsn
+    output_path = args.output_path
+
+    psconn: PgConnection = psycopg2.connect(pageserver_connstr)
+    psconn.autocommit = True
+
+    output = open(output_path, "wb")
+
+    with psconn.cursor() as pscur:
+        pscur.copy_expert(f"basebackup {tenant_id} {timeline_id} {lsn}", output)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--tenant-id",
+        dest="tenant",
+        required=True,
+        help="Id of the tenant",
+    )
+    parser.add_argument(
+        "--timeline-id",
+        dest="timeline",
+        required=True,
+        help="Id of the timeline",
+    )
+    parser.add_argument(
+        "--lsn",
+        dest="lsn",
+        required=True,
+        help="LSN to take the basebackup at",
+    )
+    parser.add_argument(
+        "--pageserver-connstr",
+        dest="pageserver_connstr",
+        required=True,
+        help="libpq connection string of the pageserver",
+    )
+    parser.add_argument(
+        "--output",
+        dest="output_path",
+        required=True,
+        help="output path to write the basebackup to",
+    )
+    args = parser.parse_args()
+    main(args)
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -1,6 +1,7 @@
 pytest_plugins = (
    "fixtures.pg_version",
    "fixtures.parametrize",
+    "fixtures.httpserver",
    "fixtures.neon_fixtures",
    "fixtures.benchmark_fixture",
    "fixtures.pg_stats",
--- a/test_runner/fixtures/httpserver.py
+++ b/test_runner/fixtures/httpserver.py
@@ -0,0 +1,45 @@
+from typing import Tuple
+
+import pytest
+from pytest_httpserver import HTTPServer
+
+# TODO: mypy fails with:
+#  Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor"  [attr-defined]
+# from fixtures.neon_fixtures import PortDistributor
+
+# compared to the fixtures from pytest_httpserver with same names, these are
+# always function scoped, so you can check and stop the server in tests.
+
+
+@pytest.fixture(scope="function")
+def httpserver_ssl_context():
+    return None
+
+
+@pytest.fixture(scope="function")
+def make_httpserver(httpserver_listen_address, httpserver_ssl_context):
+    host, port = httpserver_listen_address
+    if not host:
+        host = HTTPServer.DEFAULT_LISTEN_HOST
+    if not port:
+        port = HTTPServer.DEFAULT_LISTEN_PORT
+
+    server = HTTPServer(host=host, port=port, ssl_context=httpserver_ssl_context)
+    server.start()
+    yield server
+    server.clear()
+    if server.is_running():
+        server.stop()
+
+
+@pytest.fixture(scope="function")
+def httpserver(make_httpserver):
+    server = make_httpserver
+    yield server
+    server.clear()
+
+
+@pytest.fixture(scope="function")
+def httpserver_listen_address(port_distributor) -> Tuple[str, int]:
+    port = port_distributor.get_port()
+    return ("localhost", port)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -223,12 +223,6 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib
    return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)


-@pytest.fixture(scope="session")
-def httpserver_listen_address(port_distributor: PortDistributor):
-    port = port_distributor.get_port()
-    return ("localhost", port)
-
-
@pytest.fixture(scope="function")
 def default_broker(
    port_distributor: PortDistributor,
@@ -853,18 +847,6 @@ class NeonEnv:
        """Get list of safekeeper endpoints suitable for safekeepers GUC"""
        return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)

-    def timeline_dir(
-        self, tenant_id: TenantId, timeline_id: TimelineId, pageserver_id: Optional[int] = None
-    ) -> Path:
-        """Get a timeline directory's path based on the repo directory of the test environment"""
-        return (
-            self.tenant_dir(tenant_id, pageserver_id=pageserver_id) / "timelines" / str(timeline_id)
-        )
-
-    def tenant_dir(self, tenant_id: TenantId, pageserver_id: Optional[int] = None) -> Path:
-        """Get a tenant directory's path based on the repo directory of the test environment"""
-        return self.get_pageserver(pageserver_id).workdir / "tenants" / str(tenant_id)
-
    def get_pageserver_version(self) -> str:
        bin_pageserver = str(self.neon_binpath / "pageserver")
        res = subprocess.run(
@@ -1499,6 +1481,16 @@ class NeonAttachmentService:
            self.running = False
        return self

+    def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
+        response = requests.post(
+            f"{self.env.control_plane_api}/attach_hook",
+            json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
+        )
+        response.raise_for_status()
+        gen = response.json()["gen"]
+        assert isinstance(gen, int)
+        return gen
+
    def __enter__(self) -> "NeonAttachmentService":
        return self

@@ -1586,6 +1578,21 @@ class NeonPageserver(PgProtocol):
            '.*registered custom resource manager "neon".*',
        ]

+    def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
+        """Get a timeline directory's path based on the repo directory of the test environment"""
+        if timeline_id is None:
+            return self.tenant_dir(tenant_id) / "timelines"
+        return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
+
+    def tenant_dir(
+        self,
+        tenant_id: Optional[TenantId] = None,
+    ) -> Path:
+        """Get a tenant directory's path based on the repo directory of the test environment"""
+        if tenant_id is None:
+            return self.workdir / "tenants"
+        return self.workdir / "tenants" / str(tenant_id)
+
    def start(
        self,
        overrides: Tuple[str, ...] = (),
@@ -1692,12 +1699,7 @@ class NeonPageserver(PgProtocol):
        to call into the pageserver HTTP client.
        """
        if self.env.attachment_service is not None:
-            response = requests.post(
-                f"{self.env.control_plane_api}/attach_hook",
-                json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
-            )
-            response.raise_for_status()
-            generation = response.json()["gen"]
+            generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
        else:
            generation = None

@@ -2136,6 +2138,28 @@ class NeonProxy(PgProtocol):
    def _wait_until_ready(self):
        requests.get(f"http://{self.host}:{self.http_port}/v1/status")

+    def http_query(self, query, args, **kwargs):
+        # TODO maybe use default values if not provided
+        user = kwargs["user"]
+        password = kwargs["password"]
+        expected_code = kwargs.get("expected_code")
+
+        connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
+        response = requests.post(
+            f"https://{self.domain}:{self.external_http_port}/sql",
+            data=json.dumps({"query": query, "params": args}),
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Pool-Opt-In": "true",
+            },
+            verify=str(self.test_output_dir / "proxy.crt"),
+        )
+
+        if expected_code is not None:
+            assert response.status_code == kwargs["expected_code"], f"response: {response.json()}"
+        return response.json()
+
    def get_metrics(self) -> str:
        request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
        request_result.raise_for_status()
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -620,3 +620,8 @@ class PageserverHttpClient(requests.Session):
            },
        )
        self.verbose_error(res)
+
+    def deletion_queue_flush(self, execute: bool = False):
+        self.put(
+            f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
+        ).raise_for_status()
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -236,15 +236,27 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
    response = list_prefix(neon_env_builder, prefix)
    keys = response["KeyCount"]
    objects = response.get("Contents", [])
+    common_prefixes = response.get("CommonPrefixes", [])

-    if keys != 0 and len(objects) == 0:
-        # this has been seen in one case with mock_s3:
-        # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
-        # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-        common_prefixes = response.get("CommonPrefixes", [])
-        log.warn(
-            f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
-        )
+    remote_storage = neon_env_builder.pageserver_remote_storage
+    is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup
+
+    if is_mock_s3:
+        if keys == 1 and len(objects) == 0 and len(common_prefixes) == 1:
+            # this has been seen in the wild by tests with the below contradicting logging
+            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
+            # this seems like a mock_s3 issue
+            log.warn(
+                f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
+            )
+            keys = 0
+        elif keys != 0 and len(objects) == 0:
+            # this has been seen in one case with mock_s3:
+            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
+            # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
+            log.warn(
+                f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
+            )

    assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"

@@ -255,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional


 def list_prefix(
-    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
+    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
 ) -> ListObjectsV2OutputTypeDef:
    """
    Note that this function takes into account prefix_in_bucket.
@@ -275,7 +287,7 @@ def list_prefix(

    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
    response = remote.client.list_objects_v2(
-        Delimiter="/",
+        Delimiter=delimiter,
        Bucket=remote.bucket_name,
        Prefix=prefix,
    )
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -115,6 +115,8 @@ class S3Storage:
    prefix_in_bucket: str
    client: S3Client
    cleanup: bool
+    """Is this MOCK_S3 (false) or REAL_S3 (true)"""
+    real: bool
    endpoint: Optional[str] = None

    def access_env_vars(self) -> Dict[str, str]:
@@ -265,6 +267,7 @@ class RemoteStorageKind(str, enum.Enum):
                prefix_in_bucket="",
                client=client,
                cleanup=False,
+                real=False,
            )

        assert self == RemoteStorageKind.REAL_S3
@@ -300,6 +303,7 @@ class RemoteStorageKind(str, enum.Enum):
            prefix_in_bucket=prefix_in_bucket,
            client=client,
            cleanup=True,
+            real=True,
        )


--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -44,7 +44,7 @@ def measure_recovery_time(env: NeonCompare):

    # Stop pageserver and remove tenant data
    env.env.pageserver.stop()
-    timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
+    timeline_dir = env.env.pageserver.timeline_dir(env.tenant, env.timeline)
    shutil.rmtree(timeline_dir)

    # Start pageserver
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -135,7 +135,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)

    tenant_id = env.initial_tenant

-    timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
+    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

@@ -166,7 +166,7 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde

    tenant_id = env.initial_tenant

-    timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
+    timelines_dir = env.pageserver.timeline_dir(tenant_id)
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
    initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -20,7 +20,7 @@ from fixtures.pageserver.utils import (
    wait_for_last_record_lsn,
    wait_for_upload,
 )
-from fixtures.pg_version import PgVersion, skip_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser
 from fixtures.types import Lsn
@@ -151,7 +151,6 @@ def test_create_snapshot(
    shutil.copytree(test_output_dir, compatibility_snapshot_dir)


-@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
@@ -209,7 +208,6 @@ def test_backward_compatibility(
    ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"


-@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
--- a/test_runner/regress/test_createdropdb.py
+++ b/test_runner/regress/test_createdropdb.py
@@ -1,16 +1,22 @@
 import os
 import pathlib

+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+from fixtures.pg_version import PgVersion
 from fixtures.utils import query_scalar


 #
 # Test CREATE DATABASE when there have been relmapper changes
 #
-def test_createdb(neon_simple_env: NeonEnv):
+@pytest.mark.parametrize("strategy", ["file_copy", "wal_log"])
+def test_createdb(neon_simple_env: NeonEnv, strategy: str):
    env = neon_simple_env
+    if env.pg_version == PgVersion.V14 and strategy == "wal_log":
+        pytest.skip("wal_log strategy not supported on PostgreSQL 14")
+
    env.neon_cli.create_branch("test_createdb", "empty")

    endpoint = env.endpoints.create_start("test_createdb")
@@ -20,7 +26,10 @@ def test_createdb(neon_simple_env: NeonEnv):
        # Cause a 'relmapper' change in the original branch
        cur.execute("VACUUM FULL pg_class")

-        cur.execute("CREATE DATABASE foodb")
+        if env.pg_version == PgVersion.V14:
+            cur.execute("CREATE DATABASE foodb")
+        else:
+            cur.execute(f"CREATE DATABASE foodb STRATEGY={strategy}")

        lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")

--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -42,12 +42,11 @@ def handle_role(dbs, roles, operation):
        raise ValueError("Invalid op")


-fail = False
-
-
-def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response:
+def ddl_forward_handler(
+    request: Request, dbs: Dict[str, str], roles: Dict[str, str], ddl: "DdlForwardingContext"
+) -> Response:
    log.info(f"Received request with data {request.get_data(as_text=True)}")
-    if fail:
+    if ddl.fail:
        log.info("FAILING")
        return Response(status=500, response="Failed just cuz")
    if request.json is None:
@@ -72,6 +71,7 @@ class DdlForwardingContext:
        self.port = port
        self.dbs: Dict[str, str] = {}
        self.roles: Dict[str, str] = {}
+        self.fail = False
        endpoint = "/management/api/v2/roles_and_databases"
        ddl_url = f"http://{host}:{port}{endpoint}"
        self.pg.configure(
@@ -82,7 +82,7 @@ class DdlForwardingContext:
        )
        log.info(f"Listening on {ddl_url}")
        self.server.expect_request(endpoint, method="PATCH").respond_with_handler(
-            lambda request: ddl_forward_handler(request, self.dbs, self.roles)
+            lambda request: ddl_forward_handler(request, self.dbs, self.roles, self)
        )

    def __enter__(self):
@@ -103,6 +103,9 @@ class DdlForwardingContext:
    def wait(self, timeout=3):
        self.server.wait(timeout=timeout)

+    def failures(self, bool):
+        self.fail = bool
+
    def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]:
        res = self.send(query)
        self.wait(timeout=timeout)
@@ -203,9 +206,9 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
    assert ddl.dbs == {"stork": "cork"}

    with pytest.raises(psycopg2.InternalError):
-        global fail
-        fail = True
+        ddl.failures(True)
        cur.execute("CREATE DATABASE failure WITH OWNER=cork")
        ddl.wait()

+    ddl.failures(False)
    conn.close()
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -74,11 +74,13 @@ class EvictionEnv:
    pgbench_init_lsns: Dict[TenantId, Lsn]

    def timelines_du(self) -> Tuple[int, int, int]:
-        return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])
+        return poor_mans_du(
+            self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
+        )

    def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
        return {
-            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
+            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
            for tid, tlid in self.timelines
        }

@@ -89,7 +91,21 @@ class EvictionEnv:
        """
        lsn = self.pgbench_init_lsns[tenant_id]
        with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint:
-            self.pg_bin.run(["pgbench", "-S", endpoint.connstr()])
+            # instead of using pgbench --select-only which does point selects,
+            # run full table scans for all tables
+            with endpoint.connect() as conn:
+                cur = conn.cursor()
+
+                tables_cols = {
+                    "pgbench_accounts": "abalance",
+                    "pgbench_tellers": "tbalance",
+                    "pgbench_branches": "bbalance",
+                    "pgbench_history": "delta",
+                }
+
+                for table, column in tables_cols.items():
+                    cur.execute(f"select avg({column}) from {table}")
+                    _avg = cur.fetchone()

    def pageserver_start_with_disk_usage_eviction(
        self, period, max_usage_pct, min_avail_bytes, mock_behavior
@@ -127,6 +143,19 @@ class EvictionEnv:
        self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")


+def human_bytes(amt: float) -> str:
+    suffixes = ["", "Ki", "Mi", "Gi"]
+
+    last = suffixes[-1]
+
+    for name in suffixes:
+        if amt < 1024 or name == last:
+            return f"{int(round(amt))} {name}B"
+        amt = amt / 1024
+
+    raise RuntimeError("unreachable")
+
+
@pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
    """
@@ -215,8 +244,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):

    healthy_tenant_id, healthy_timeline_id = env.timelines[1]

-    broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
-    healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+    broken_size_pre, _, _ = poor_mans_du(
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+    )
+    healthy_size_pre, _, _ = poor_mans_du(
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+    )

    # try to evict everything, then validate that broken tenant wasn't touched
    target = broken_size_pre + healthy_size_pre
@@ -224,8 +257,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
    response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
    log.info(f"{response}")

-    broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
-    healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+    broken_size_post, _, _ = poor_mans_du(
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+    )
+    healthy_size_post, _, _ = poor_mans_du(
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+    )

    assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
    assert healthy_size_post < healthy_size_pre
@@ -366,18 +403,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
    du_by_timeline = env.du_by_timeline()

    # pick any tenant
-    [our_tenant, other_tenant] = list(du_by_timeline.keys())
-    (tenant_id, timeline_id) = our_tenant
+    [warm, cold] = list(du_by_timeline.keys())
+    (tenant_id, timeline_id) = warm

-    # make our tenant more recently used than the other one
+    # make picked tenant more recently used than the other one
    env.warm_up_tenant(tenant_id)

    # Build up enough pressure to require evictions from both tenants,
    # but not enough to fall into global LRU.
-    # So, set target to all occipied space, except 2*env.layer_size per tenant
-    target = (
-        du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
-    )
+    # So, set target to all occupied space, except 2*env.layer_size per tenant
+    target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
    log.info(f"{response}")

@@ -392,22 +427,33 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
            later_tenant_usage < du_by_timeline[tenant]
        ), "all tenants should have lost some layers"

+    warm_size = later_du_by_timeline[warm]
+
+    # bounds for warmed_size
+    warm_lower = 0.5 * du_by_timeline[warm]
+
+    # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+    # So, check for up to 3 here.
+    warm_upper = warm_lower + 3 * env.layer_size
+
+    cold_size = later_du_by_timeline[cold]
+    cold_upper = 2 * env.layer_size
+
+    log.info(
+        f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
+    )
+    log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+
+    assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
+    assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+
    assert (
-        later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
-    ), "our warmed up tenant should be at about half capacity, part 1"
-    assert (
-        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
-        # So, check for up to 3 here.
-        later_du_by_timeline[our_tenant]
-        < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
-    ), "our warmed up tenant should be at about half capacity, part 2"
-    assert (
-        later_du_by_timeline[other_tenant] < 2 * env.layer_size
-    ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"
+        cold_size < cold_upper
+    ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"


 def poor_mans_du(
-    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
+    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
 ) -> Tuple[int, int, int]:
    """
    Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -417,7 +463,7 @@ def poor_mans_du(
    largest_layer = 0
    smallest_layer = None
    for tenant_id, timeline_id in timelines:
-        timeline_dir = env.timeline_dir(tenant_id, timeline_id)
+        timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
        assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
        total = 0
        for file in timeline_dir.iterdir():
@@ -430,9 +476,11 @@ def poor_mans_du(
                smallest_layer = min(smallest_layer, size)
            else:
                smallest_layer = size
-            log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
+            if verbose:
+                log.info(f"{tenant_id}/{timeline_id} => {file.name} {size} ({human_bytes(size)})")

-        log.info(f"{tenant_id}/{timeline_id}: sum {total}")
+        if verbose:
+            log.info(f"{tenant_id}/{timeline_id}: sum {total} ({human_bytes(total)})")
        total_on_disk += total

    assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -15,45 +15,45 @@ from fixtures.types import TimelineId

 # Test configuration
 #
-# Create a table with {num_rows} rows, and perform {updates_to_perform} random
-# UPDATEs on it, using {num_connections} separate connections.
-num_connections = 10
-num_rows = 100000
-updates_to_perform = 10000
-
-updates_performed = 0
-
-
-# Run random UPDATEs on test table
-async def update_table(endpoint: Endpoint):
-    global updates_performed
-    pg_conn = await endpoint.connect_async()
-
-    while updates_performed < updates_to_perform:
-        updates_performed += 1
-        id = random.randrange(1, num_rows)
-        await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}")
-
-
-# Perform aggressive GC with 0 horizon
-async def gc(env: NeonEnv, timeline: TimelineId):
-    pageserver_http = env.pageserver.http_client()
-
-    loop = asyncio.get_running_loop()
-
-    def do_gc():
-        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
-        pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
-
-    with concurrent.futures.ThreadPoolExecutor() as pool:
-        while updates_performed < updates_to_perform:
-            await loop.run_in_executor(pool, do_gc)
+# Create a table with {NUM_ROWS} rows, and perform {UPDATES_TO_PERFORM} random
+# UPDATEs on it, using {NUM_CONNECTIONS} separate connections.
+NUM_CONNECTIONS = 10
+NUM_ROWS = 100000
+UPDATES_TO_PERFORM = 10000


 # At the same time, run UPDATEs and GC
 async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
    workers = []
-    for _ in range(num_connections):
+    updates_performed = 0
+
+    # Perform aggressive GC with 0 horizon
+    async def gc(env: NeonEnv, timeline: TimelineId):
+        pageserver_http = env.pageserver.http_client()
+        nonlocal updates_performed
+        global UPDATES_TO_PERFORM
+
+        loop = asyncio.get_running_loop()
+
+        def do_gc():
+            pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
+            pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
+
+        with concurrent.futures.ThreadPoolExecutor() as pool:
+            while updates_performed < UPDATES_TO_PERFORM:
+                await loop.run_in_executor(pool, do_gc)
+
+    # Run random UPDATEs on test table
+    async def update_table(endpoint: Endpoint):
+        pg_conn = await endpoint.connect_async()
+        nonlocal updates_performed
+
+        while updates_performed < UPDATES_TO_PERFORM:
+            updates_performed += 1
+            id = random.randrange(1, NUM_ROWS)
+            await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}")
+
+    for _ in range(NUM_CONNECTIONS):
        workers.append(asyncio.create_task(update_table(endpoint)))
    workers.append(asyncio.create_task(gc(env, timeline)))

@@ -81,7 +81,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
            f"""
            INSERT INTO foo
                SELECT g, 0, 'long string to consume some space' || g
-                FROM generate_series(1, {num_rows}) g
+                FROM generate_series(1, {NUM_ROWS}) g
        """
        )
        cur.execute("CREATE INDEX ON foo(id)")
@@ -91,7 +91,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
        cur.execute("SELECT COUNT(*), SUM(counter) FROM foo")
        r = cur.fetchone()
        assert r is not None
-        assert r == (num_rows, updates_to_perform)
+        assert r == (NUM_ROWS, UPDATES_TO_PERFORM)


 #
@@ -99,6 +99,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
 def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
+    num_index_uploads = 0

    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

@@ -160,5 +161,5 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
        log.info(f"{num_index_uploads} index uploads after GC iteration {i}")

    after = num_index_uploads
-    log.info(f"{after-before} new index uploads during test")
+    log.info(f"{after - before} new index uploads during test")
    assert after - before < 5
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -271,7 +271,7 @@ def _import(
    env.endpoints.stop_all()
    env.pageserver.stop()

-    dir_to_clear = Path(env.pageserver.workdir) / "tenants"
+    dir_to_clear = env.pageserver.tenant_dir()
    shutil.rmtree(dir_to_clear)
    os.mkdir(dir_to_clear)

--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -55,7 +55,7 @@ def test_basic_eviction(
    for sk in env.safekeepers:
        sk.stop()

-    timeline_path = env.timeline_dir(tenant_id, timeline_id)
+    timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
    initial_local_layers = sorted(
        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
    )
@@ -243,7 +243,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
    assert by_kind["Image"] > 0
    assert by_kind["Delta"] > 0
    assert by_kind["InMemory"] == 0
-    resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+    resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
    log.info("resident layers count before eviction: %s", len(resident_layers))

    log.info("evict all layers")
@@ -251,7 +251,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):

    def ensure_resident_and_remote_size_metrics():
        log.info("ensure that all the layers are gone")
-        resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
+        resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
        # we have disabled all background loops, so, this should hold
        assert len(resident_layers) == 0

--- a/Show More
+++ b/Show More