diff --git a/CODEOWNERS b/CODEOWNERS
index b8ca54bc7e..e384dc39f1 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -5,7 +5,7 @@
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
diff --git a/Cargo.lock b/Cargo.lock
index 2055f001af..36e7069eb1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1780,18 +1780,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
 [[package]]
 name = "hermit-abi"
-version = "0.2.6"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "hermit-abi"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
 
 [[package]]
 name = "hex"
@@ -2053,7 +2044,7 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
 dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
  "libc",
  "windows-sys 0.48.0",
 ]
@@ -2070,7 +2061,7 @@ version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
  "io-lifetimes",
  "rustix 0.37.19",
  "windows-sys 0.48.0",
@@ -2444,11 +2435,11 @@ dependencies = [
 
 [[package]]
 name = "num_cpus"
-version = "1.15.0"
+version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
+checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
 dependencies = [
- "hermit-abi 0.2.6",
+ "hermit-abi",
  "libc",
 ]
 
@@ -3246,6 +3237,7 @@ dependencies = [
  "reqwest-tracing",
  "routerify",
  "rstest",
+ "rustc-hash",
  "rustls",
  "rustls-pemfile",
  "scopeguard",
@@ -3417,6 +3409,7 @@ dependencies = [
  "metrics",
  "once_cell",
  "pin-project-lite",
+ "rand",
  "scopeguard",
  "serde",
  "serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index 4fe3069822..b0bcf69039 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,6 +107,7 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
+rustc-hash = "1.1.0"
 rustls = "0.21"
 rustls-pemfile = "1"
 rustls-split = "0.3"
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 55eb9b7411..7e34b66d68 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -615,11 +615,7 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 #########################################################################################
 #
 # Layer "rust extensions"
-# This layer is used to build `pgx` deps
-#
-# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
-# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
-# dependency on all the rust extension that depend on it, too.
+# This layer is used to build `pgrx` deps
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
@@ -635,22 +631,12 @@ USER nonroot
 WORKDIR /home/nonroot
 ARG PG_VERSION
 
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
-        ;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
     chmod +x rustup-init && \
     ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
     rm rustup-init && \
-    cargo install --locked --version 0.7.3 cargo-pgx && \
-    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+    cargo install --locked --version 0.10.2 cargo-pgrx && \
+    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
 
 USER root
 
@@ -664,23 +650,11 @@ USER root
 FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
-# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
-# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
     mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
 
 #########################################################################################
@@ -693,26 +667,11 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION
 
-# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
-# Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
-# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
-# same 1.1 version we've used before.
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
-    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
+    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
     mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
     # it's needed to enable extension because it uses untrusted C language
     sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -727,21 +686,11 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION
 
-# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
-    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
+# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
     mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
-    cargo pgx install --release && \
+    cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
 
 #########################################################################################
@@ -754,21 +703,15 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION
 
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
-    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
+    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
     mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
+    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    echo "********************************************************************************************************" && \
+    sed -i 's/pgrx       = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
 
 #########################################################################################
diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs
index e879646b63..d4bca59c7b 100644
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
     if attach_req.pageserver_id.is_some() {
         tenant_state.generation += 1;
     }
+    tenant_state.pageserver = attach_req.pageserver_id;
     let generation = tenant_state.generation;
 
     locked.save().await.map_err(ApiError::InternalServerError)?;
diff --git a/deny.toml b/deny.toml
index 55c581ce3a..f4ea0d4dac 100644
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = ["RUSTSEC-2023-0052"]
+ignore = []
 
 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md
new file mode 100644
index 0000000000..f708f641aa
--- /dev/null
+++ b/docs/rfcs/028-pageserver-migration.md
@@ -0,0 +1,599 @@
+# Seamless tenant migration
+
+- Author: john@neon.tech
+- Created on 2023-08-11
+- Implemented on ..
+
+## Summary
+
+The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant
+migration safe". Following that,
+this RFC is about how those migrations are to be done:
+
+1. Seamlessly (without interruption to client availability)
+2. Quickly (enabling faster operations)
+3. Efficiently (minimizing I/O and $ cost)
+
+These points are in priority order: if we have to sacrifice
+efficiency to make a migration seamless for clients, we will
+do so, etc.
+
+This is accomplished by introducing two high level changes:
+
+- A dual-attached state for tenants, used in a control-plane-orchestrated
+  migration procedure that preserves availability during a migration.
+- Warm secondary locations for tenants, where on-disk content is primed
+  for a fast migration of the tenant from its current attachment to this
+  secondary location.
+
+## Motivation
+
+Migrating tenants between pageservers is essential to operating a service
+at scale, in several contexts:
+
+1. Responding to a pageserver node failure by migrating tenants to other pageservers
+2. Balancing load and capacity across pageservers, for example when a user expands their
+   database and they need to migrate to a pageserver with more capacity.
+3. Restarting pageservers for upgrades and maintenance
+
+The current situation steps for migration are:
+
+- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)).
+- attach to new node
+- re-configure endpoints to use the new node
+
+Once [generation numbers](025-generation-numbers.md) are implemented,
+the detach step is no longer critical for correctness. So, we can
+
+- attach to a new node,
+- re-configure endpoints to use the new node, and then
+- detach from the old node.
+
+However, this still does not meet our seamless/fast/efficient goals:
+
+- Not fast: The new node will have to download potentially large amounts
+  of data from S3, which may take many minutes.
+- Not seamless: If we attach to a new pageserver before detaching an old one,
+  the new one might delete some objects that interrupt availability of reads on the old one.
+- Not efficient: the old pageserver will continue uploading
+  S3 content during the migration that will never be read.
+
+The user expectations for availability are:
+
+- For planned maintenance, there should be zero availability
+  gap. This expectation is fulfilled by this RFC.
+- For unplanned changes (e.g. node failures), there should be
+  minimal availability gap. This RFC provides the _mechanism_
+  to fail over quickly, but does not provide the failure _detection_
+  nor failover _policy_.
+
+## Non Goals
+
+- Defining service tiers with different storage strategies: the same
+  level of HA & overhead will apply to all tenants. This doesn't rule out
+  adding such tiers in future.
+- Enabling pageserver failover in the absence of a control plane: the control
+  plane will remain the source of truth for what should be attached where.
+- Totally avoiding availability gaps on unplanned migrations during
+  a failure (we expect a small, bounded window of
+  read unavailability of very recent LSNs)
+- Workload balancing: this RFC defines the mechanism for moving tenants
+  around, not the higher level logic for deciding who goes where.
+- Defining all possible configuration flows for tenants: the migration process
+  defined in this RFC demonstrates the sufficiency of the pageserver API, but
+  is not the only kind of configuration change the control plane will ever do.
+  The APIs defined here should let the control plane move tenants around in
+  whatever way is needed while preserving data safety and read availability.
+
+## Impacted components
+
+Pageserver, control plane
+
+## Terminology
+
+- **Attachment**: a tenant is _attached_ to a pageserver if it has
+  been issued a generation number, and is running an instance of
+  the `Tenant` type, ingesting the WAL, and available to serve
+  page reads.
+- **Location**: locations are a superset of attachments. A location
+  is a combination of a tenant and a pageserver. We may _attach_ at a _location_.
+
+- **Secondary location**: a location which is not currently attached.
+- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features.
+
+## Implementation (high level)
+
+### Warm secondary locations
+
+To enable faster migrations, we will identify at least one _secondary location_
+for each tenant. This secondary location will keep a warm cache of layers
+for the tenant, so that if it is later attached, it can catch up with the
+latest LSN quickly: rather than downloading everything, it only has to replay
+the recent part of the WAL to advance from the remote_consistent_offset to the
+most recent LSN in the WAL.
+
+The control plane is responsible for selecting secondary locations, and
+calling into pageservers to configure tenants into a secondary mode at this
+new location, as well as attaching the tenant in its existing primary location.
+
+The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap)
+to advise secondaries of which layers should be downloaded.
+
+### Location modes
+
+Currently, we consider a tenant to be in one of two states on a pageserver:
+
+- Attached: active `Tenant` object, and layers on local disk
+- Detached: no layers on local disk, no runtime state.
+
+We will extend this with finer-grained modes, whose purpose will become
+clear in later sections:
+
+- **AttachedSingle**: equivalent the existing attached state.
+- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but
+  does not do deletions.
+- **AttachedStale**: like AttachedSingle, holds a stale generation,
+  do not do any remote storage operations.
+- **Secondary**: keep local state on disk, periodically update from S3.
+- **Detached**: equivalent to existing detached state.
+
+To control these finer grained states, a new pageserver API endpoint will be added.
+
+### Cutover procedure
+
+Define old location and new location as "Node A" and "Node B". Consider
+the case where both nodes are available, and Node B was previously configured
+as a secondary location for the tenant we are migrating.
+
+The cutover procedure is orchestrated by the control plane, calling into
+the pageservers' APIs:
+
+1. Call to Node A requesting it to flush to S3 and enter AttachedStale state
+2. Increment generation, and call to Node B requesting it to enter AttachedMulti
+   state with the new generation.
+3. Call to Node B, requesting it to download the latest hot layers from remote storage,
+   according to the latest heatmap flushed by Node A.
+4. Wait for Node B's WAL ingestion to catch up with node A's
+5. Update endpoints to use node B instead of node A
+6. Call to node B requesting it to enter state AttachedSingle.
+7. Call to node A requesting it to enter state Secondary
+
+The following table summarizes how the state of the system advances:
+
+|     Step      |     Node A     |     Node B     | Node used by endpoints |
+| :-----------: | :------------: | :------------: | :--------------------: |
+| 1 (_initial_) | AttachedSingle |   Secondary    |           A            |
+|       2       | AttachedStale  | AttachedMulti  |           A            |
+|       3       | AttachedStale  | AttachedMulti  |           A            |
+|       4       | AttachedStale  | AttachedMulti  |           A            |
+| 5 (_cutover_) | AttachedStale  | AttachedMulti  |           B            |
+|       6       | AttachedStale  | AttachedSingle |           B            |
+|  7 (_final_)  |   Secondary    | AttachedSingle |           B            |
+
+The procedure described for a clean handover from a live node to a secondary
+is also used for failure cases and for migrations to a location that is not
+configured as a secondary, by simply skipping irrelevant steps, as described in
+the following sections.
+
+#### Migration from an unresponsive node
+
+If node A is unavailable, then all calls into
+node A are skipped and we don't wait for B to catch up before
+switching updating the endpoints to use B.
+
+#### Migration to a location that is not a secondary
+
+If node B is initially in Detached state, the procedure is identical. Since Node B
+is coming from a Detached state rather than Secondary, the download of layers and
+catch up with WAL will take much longer.
+
+We might do this if:
+
+- Attached and secondary locations are both critically low on disk, and we need
+  to migrate to a third node with more resources available.
+- We are migrating a tenant which does not use secondary locations to save on cost.
+
+#### Permanent migration away from a node
+
+In the final step of the migration, we generally request the original node to enter a Secondary
+state. This is typical if we are doing a planned migration during maintenance, or to
+balance CPU/network load away from a node.
+
+One might also want to permanently migrate away: this can be done by simply removing the secondary
+location after the migration is complete, or as an optimization by substituting the Detached state
+for the Secondary state in the final step.
+
+#### Cutover diagram
+
+```mermaid
+sequenceDiagram
+participant CP as Control plane
+participant A as Node A
+participant B as Node B
+participant E as Endpoint
+
+CP->>A: PUT Flush & go to AttachedStale
+note right of A: A continues to ingest WAL
+CP->>B: PUT AttachedMulti
+CP->>B: PUT Download layers from latest heatmap
+note right of B: B downloads from S3
+loop Poll until download complete
+CP->>B: GET download status
+end
+activate B
+note right of B: B ingests WAL
+loop Poll until catch up
+CP->>B: GET visible WAL
+CP->>A: GET visible WAL
+end
+deactivate B
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+CP->>B: PUT AttachedSingle
+CP->>A: PUT Secondary
+```
+
+#### Cutover from an unavailable pageserver
+
+This case is far simpler: we may skip straight to our intended
+end state.
+
+```mermaid
+sequenceDiagram
+participant A as Node A
+participant CP as Control plane
+participant B as Node B
+participant E as Endpoint
+
+note right of A: Node A offline
+activate A
+CP->>B: PUT AttachedSingle
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+deactivate A
+```
+
+## Implementation (detail)
+
+### Purpose of AttachedMulti, AttachedStale
+
+#### AttachedMulti
+
+Ordinarily, an attached pageserver whose generation is the latest may delete
+layers at will (e.g. during compaction). If a previous generation pageserver
+is also still attached, and in use by endpoints, then this layer deletion could
+lead to a loss of availability for the endpoint when reading from the previous
+generation pageserver.
+
+The _AttachedMulti_ state simply disables deletions. These will be enqueued
+in `RemoteTimelineClient` until the control plane transitions the
+node into AttachedSingle, which unblocks deletions.  Other remote storage operations
+such as uploads are not blocked.
+
+AttachedMulti is not required for data safety, only to preserve availability
+on pageservers running with stale generations.
+
+A node enters AttachedMulti only when explicitly asked to by the control plane. It should
+only remain in this state for the duration of a migration.
+
+If a control plane bug leaves
+the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued
+deletions. This may be accomplished simply, by dropping enqueued deletions when some modest
+threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions,
+it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or
+by timeline deletion.
+
+During AttachedMulti, the Tenant is free to drop layers from local disk in response to
+disk pressure: only the deletion of remote layers is blocked.
+
+#### AttachedStale
+
+Currently, a pageserver with a stale generation number will continue to
+upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation
+will not be read back by future generations of pageservers.
+
+The _AttachedStale_ state disables S3 uploads. The stale pageserver
+will continue to ingest the WAL and write layers to local disk, but not to
+do any uploads to S3.
+
+A node may enter AttachedStale in two ways:
+
+- Explicitly, when control plane calls into the node at the start of a migration.
+- Implicitly, when the node tries to validate some deletions and discovers
+  that its generation is stale.
+
+The AttachedStale state also disables sending consumption metrics from
+that location: it is interpreted as an indication that some other pageserver
+is already attached or is about to be attached, and that new pageserver will
+be responsible for sending consumption metrics.
+
+#### Disk Pressure & AttachedStale
+
+Over long periods of time, a tenant location in AttachedStale will accumulate data
+on local disk, as it cannot evict any layers written since it entered the
+AttachStale state. We rely on the control plane to revert the location to
+Secondary or Detached at the end of a migration.
+
+This scenario is particularly noteworthy when evacuating all tenants on a pageserver:
+since _all_ the attached tenants will go into AttachedStale, we will be doing no
+uploads at all, therefore ingested data will cause disk usage to increase continuously.
+Under nominal conditions, the available disk space on pageservers should be sufficient
+to complete the evacuation before this becomes a problem, but we must also handle
+the case where we hit a low disk situation while in this state.
+
+The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task`
+touches each Tenant when it determines that a low-disk condition requires
+some layer eviction. Having selected layers for eviction, the eviction
+task calls `Timeline::evict_layers`.
+
+**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted
+layers are not yet uploaded to S3, then the block on uploads will be lifted. This
+will result in leaking some objects once a migration is complete, but will enable
+the node to manage its disk space properly: if a node is left with some tenants
+in AttachedStale indefinitely due to a network partition or control plane bug,
+these tenants will not cause a full disk condition.
+
+### Warm secondary updates
+
+#### Layer heatmap
+
+The secondary location's job is to serve reads **with the same quality of service as the original location
+was serving them around the time of a migration**. This does not mean the secondary
+location needs the whole set of layers: inactive layers that might soon
+be evicted on the attached pageserver need not be downloaded by the
+secondary. A totally idle tenant only needs to maintain enough on-disk
+state to enable a fast cold start (i.e. the most recent image layers are
+typically sufficient).
+
+To enable this, we introduce the concept of a _layer heatmap_, which
+acts as an advisory input to secondary locations to decide which
+layers to download from S3.
+
+#### Attached pageserver
+
+The attached pageserver, if in state AttachedSingle, periodically
+uploads a serialized heat map to S3. It may skip this if there
+is no change since the last time it uploaded (e.g. if the tenant
+is totally idle).
+
+Additionally, when the tenant is flushed to remote storage prior to a migration
+(the first step in [cutover procedure](#cutover-procedure)), 
+the heatmap is written out. This enables a future attached pageserver
+to get an up to date view when deciding which layers to download.
+
+#### Secondary location behavior
+
+Secondary warm locations run a simple loop, implemented separately from
+the main `Tenant` type, which represents attached tenants:
+
+- Download the layer heatmap
+- Select any "hot enough" layers to download, if there is sufficient
+  free disk space.
+- Download layers, if they were not previously evicted (see below)
+- Download the latest index_part.json
+- Check if any layers currently on disk are no longer referenced by
+  IndexPart & delete them
+
+Note that the heatmap is only advisory: if a secondary location has plenty
+of disk space, it may choose to retain layers that aren't referenced
+by the heatmap, as long as they are still referenced by the IndexPart. Conversely,
+if a node is very low on disk space, it might opt to raise the heat threshold required
+to both downloading a layer, until more disk space is available.
+
+#### Secondary locations & disk pressure
+
+Secondary locations are subject to eviction on disk pressure, just as
+attached locations are.  For eviction purposes, the access time of a
+layer in a secondary location will be the access time given in the heatmap,
+rather than the literal time at which the local layer file was accessed.
+
+The heatmap will indicate which layers are in local storage on the attached
+location.  The secondary will always attempt to get back to having that
+set of layers on disk, but to avoid flapping, it will remember the access
+time of the layer it was most recently asked to evict, and layers whose
+access time is below that will not be re-downloaded.
+
+The resulting behavior is that after a layer is evicted from a secondary
+location, it is only re-downloaded once the attached pageserver accesses
+the layer and uploads a heatmap reflecting that access time.  On a pageserver
+restart, the secondary location will attempt to download all layers in
+the heatmap again, if they are not on local disk.
+
+This behavior will be slightly different when secondary locations are
+used for "low energy tenants", but that is beyond the scope of this RFC.
+
+### Location configuration API
+
+Currently, the `/tenant/<tenant_id>/config` API defines various
+tunables like compaction settings, which apply to the tenant irrespective
+of which pageserver it is running on.
+
+A new "location config" structure will be introduced, which defines
+configuration which is per-tenant, but local to a particular pageserver,
+such as the attachment mode and whether it is a secondary.
+
+The pageserver will expose a new per-tenant API for setting
+the state: `/tenant/<tenant_id>/location/config`.
+
+Body content:
+
+```
+{
+  state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}',
+  generation: Option<u32>,
+  configuration: `Option<TenantConfig>`
+  flush: bool
+}
+```
+
+Existing `/attach` and `/detach` endpoint will have the same
+behavior as calling `/location/config` with `AttachedSingle` and `Detached`
+states respectively. These endpoints will be deprecated and later
+removed.
+
+The generation attribute is mandatory for entering `AttachedSingle` or
+`AttachedMulti`.
+
+The configuration attribute is mandatory when entering any state other
+than `Detached`. This configuration is the same as the body for
+the existing `/tenant/<tenant_id>/config` endpoint.
+
+The `flush` argument indicates whether the pageservers should flush
+to S3 before proceeding: this only has any effect if the node is
+currently in AttachedSingle or AttachedMulti. This is used
+during the first phase of migration, when transitioning the
+old pageserver to AttachedSingle.
+
+The `/re-attach` API response will be extended to include a `state` as
+well as a `generation`, enabling the pageserver to enter the
+correct state for each tenant on startup.
+
+### Database schema for locations
+
+A new table `ProjectLocation`:
+
+- pageserver_id: int
+- tenant_id: TenantId
+- generation: Option<int>
+- state: `enum(Secondary, AttachedSingle, AttachedMulti)`
+
+Notes:
+
+- It is legacy for a Project to have zero `ProjectLocation`s
+- The `pageserver` column in `Project` now means "to which pageserver should
+  endpoints connect", rather than simply which pageserver is attached.
+- The `generation` column in `Project` remains, and is incremented and used
+  to set the generation of `ProjectLocation` rows when they are set into
+  an attached state.
+- The `Detached` state is implicitly represented as the absence of
+  a `ProjectLocation`.
+
+### Executing migrations
+
+Migrations will be implemented as Go functions, within the
+existing `Operation` framework in the control plane. These
+operations are persistent, such that they will always keep
+trying until completion: this property is important to avoid
+leaving garbage behind on pageservers, such as AttachedStale
+locations.
+
+### Recovery from failures during migration
+
+During migration, the control plane may encounter failures of either
+the original or new pageserver, or both:
+
+- If the original fails, skip past waiting for the new pageserver
+  to catch up, and put it into AttachedSingle immediately.
+- If the new node fails, put the old pageserver into Secondary
+  and then back into AttachedSingle (this has the effect of
+  retaining on-disk state and granting it a fresh generation number).
+- If both nodes fail, keep trying until one of them is available
+  again.
+
+### Control plane -> Pageserver reconciliation
+
+A migration may be done while the old node is unavailable,
+in which case the old node may still be running in an AttachedStale
+state.
+
+In this case, it is undesirable to have the migration `Operation`
+stay alive until the old node eventually comes back online
+and can be cleaned up. To handle this, the control plane
+should run a background reconciliation process to compare
+a pageserver's attachments with the database, and clean up
+any that shouldn't be there any more.
+
+Note that there will be no work to do if the old node was really
+offline, as during startup it will call into `/re-attach` and
+be updated that way. The reconciliation will only be needed
+if the node was unavailable but still running.
+
+## Alternatives considered
+
+### Only enabling secondary locations for tenants on a higher service tier
+
+This will make sense in future, especially for tiny databases that may be
+downloaded from S3 in milliseconds when needed.
+
+However, it is not wise to do it immediately, because pageservers contain
+a mixture of higher and lower tier workloads. If we had 1 tenant with
+a secondary location and 9 without, then those other 9 tenants will do
+a lot of I/O as they try to recover from S3, which may degrade the
+service of the tenant which had a secondary location.
+
+Until we segregate tenant on different service tiers on different pageserver
+nodes, or implement & test QoS to ensure that tenants with secondaries are
+not harmed by tenants without, we should use the same failover approach
+for all the tenants.
+
+### Hot secondary locations (continuous WAL replay)
+
+Instead of secondary locations populating their caches from S3, we could
+have them consume the WAL from safekeepers. The downsides of this would be:
+
+- Double load on safekeepers, which are a less scalable service than S3
+- Secondary locations' on-disk state would end up subtly different to
+  the remote state, which would make synchronizing with S3 more complex/expensive
+  when going into attached state.
+
+The downside of only updating secondary locations from S3 is that we will
+have a delay during migration from replaying the LSN range between what's
+in S3 and what's in the pageserver. This range will be very small on
+planned migrations, as we have the old pageserver flush to S3 immediately
+before attaching the new pageserver. On unplanned migrations (old pageserver
+is unavailable), the range of LSNs to replay is bounded by the flush frequency
+on the old pageserver. However, the migration doesn't have to wait for the
+replay: it's just that not-yet-replayed LSNs will be unavailable for read
+until the new pageserver catches up.
+
+We expect that pageserver reads of the most recent LSNs will be relatively
+rare, as for an active endpoint those pages will usually still be in the postgres
+page cache: this leads us to prefer synchronizing from S3 on secondary
+locations, rather than consuming the WAL from safekeepers.
+
+### Cold secondary locations
+
+It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then
+we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes).
+
+Warm caches on secondary locations are necessary to meet
+our availability goals.
+
+### Pageserver-granularity failover
+
+Instead of migrating tenants individually, we could have entire spare nodes,
+and on a node death, move all its work to one of these spares.
+
+This approach is avoided for several reasons:
+
+- we would still need fine-grained tenant migration for other
+  purposes such as balancing load
+- by sharing the spare capacity over many peers rather than one spare node,
+  these peers may use the capacity for other purposes, until it is needed
+  to handle migrated tenants. e.g. for keeping a deeper cache of their
+  attached tenants.
+
+### Readonly during migration
+
+We could simplify migrations by making both previous and new nodes go into a
+readonly state, then flush remote content from the previous node, then activate
+attachment on the secondary node.
+
+The downside to this approach is a potentially large gap in readability of
+recent LSNs while loading data onto the new node. To avoid this, it is worthwhile
+to incur the extra cost of double-replaying the WAL onto old and new nodes' local
+storage during a migration.
+
+### Peer-to-peer pageserver communication
+
+Rather than uploading the heatmap to S3, attached pageservers could make it
+available to peers.
+
+Currently, pageservers have no peer to peer communication, so adding this
+for heatmaps would incur significant overhead in deployment and configuration
+of the service, and ensuring that when a new pageserver is deployed, other
+pageservers are updated to be aware of it.
+
+As well as simplifying implementation, putting heatmaps in S3 will be useful
+for future analytics purposes -- gathering aggregated statistics on activity
+pattersn across many tenants may be done directly from data in S3.
diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs
index 7b133c61af..9e89327e84 100644
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
 
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
     pub events: std::borrow::Cow<'a, [T]>,
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index f354296be2..68620787bb 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -363,8 +363,15 @@ pub struct TimelineInfo {
     pub latest_gc_cutoff_lsn: Lsn,
     #[serde_as(as = "DisplayFromStr")]
     pub disk_consistent_lsn: Lsn,
+
+    /// The LSN that we have succesfully uploaded to remote storage
     #[serde_as(as = "DisplayFromStr")]
     pub remote_consistent_lsn: Lsn,
+
+    /// The LSN that we are advertizing to safekeepers
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn_visible: Lsn,
+
     pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
     /// Sum of the size of all layer files.
     /// If a layer is present in both local FS and S3, it counts only once.
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index a4adae6146..2b808779f4 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -29,3 +29,4 @@ workspace_hack.workspace = true
 [dev-dependencies]
 tempfile.workspace = true
 test-context.workspace = true
+rand.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 1ddd156a08..a92b87632b 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -20,6 +20,7 @@ use std::{
 
 use anyhow::{bail, Context};
 
+use serde::{Deserialize, Serialize};
 use tokio::io;
 use toml_edit::Item;
 use tracing::info;
@@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
 
+/// As defined in S3 docs
+pub const MAX_KEYS_PER_DELETE: usize = 1000;
+
 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 
 /// Path on the remote storage, relative to some inner prefix.
@@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);
 
+impl Serialize for RemotePath {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.collect_str(self)
+    }
+}
+
+impl<'de> Deserialize<'de> for RemotePath {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let str = String::deserialize(deserializer)?;
+        Ok(Self(PathBuf::from(&str)))
+    }
+}
+
 impl std::fmt::Display for RemotePath {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.0.display())
@@ -88,6 +111,10 @@ impl RemotePath {
     pub fn extension(&self) -> Option<&str> {
         self.0.extension()?.to_str()
     }
+
+    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
+        self.0.strip_prefix(&p.0)
+    }
 }
 
 /// Storage (potentially remote) API to manage its state.
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 9262f1e88f..fc6d7fa61b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -33,11 +33,10 @@ use tracing::debug;
 
 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
-const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
-
 pub(super) mod metrics;
 
 use self::metrics::{AttemptOutcome, RequestKind};
@@ -48,10 +47,47 @@ pub struct S3Bucket {
     bucket_name: String,
     prefix_in_bucket: Option<String>,
     max_keys_per_list_response: Option<i32>,
+    concurrency_limiter: ConcurrencyLimiter,
+}
+
+struct ConcurrencyLimiter {
     // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
     // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
     // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Arc<Semaphore>,
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get => &self.read,
+            RequestKind::Put => &self.write,
+            RequestKind::List => &self.read,
+            RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            read: Arc::new(Semaphore::new(limit)),
+            write: Arc::new(Semaphore::new(limit)),
+        }
+    }
 }
 
 #[derive(Default)]
@@ -118,7 +154,7 @@ impl S3Bucket {
             bucket_name: aws_config.bucket_name.clone(),
             max_keys_per_list_response: aws_config.max_keys_per_list_response,
             prefix_in_bucket,
-            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
+            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
         })
     }
 
@@ -157,7 +193,7 @@ impl S3Bucket {
         let started_at = start_counting_cancelled_wait(kind);
         let permit = self
             .concurrency_limiter
-            .acquire()
+            .acquire(kind)
             .await
             .expect("semaphore is never closed");
 
@@ -173,8 +209,7 @@ impl S3Bucket {
         let started_at = start_counting_cancelled_wait(kind);
         let permit = self
             .concurrency_limiter
-            .clone()
-            .acquire_owned()
+            .acquire_owned(kind)
             .await
             .expect("semaphore is never closed");
 
@@ -500,7 +535,7 @@ impl RemoteStorage for S3Bucket {
             delete_objects.push(obj_id);
         }
 
-        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
             let started_at = start_measuring_requests(kind);
 
             let resp = self
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 982c01a9be..b220349829 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -378,21 +378,30 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
 fn create_s3_client(
     max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
     let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
         .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
     let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
         .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
-    let random_prefix_part = std::time::SystemTime::now()
+
+    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
+    // millis is just a debugging aid for easier finding the prefix later.
+    let millis = std::time::SystemTime::now()
         .duration_since(UNIX_EPOCH)
         .context("random s3 test prefix part calculation")?
-        .as_nanos();
+        .as_millis();
+
+    // because nanos can be the same for two threads so can millis, add randomness
+    let random = rand::thread_rng().gen::<u32>();
+
     let remote_storage_config = RemoteStorageConfig {
         max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
         max_sync_errors: NonZeroU32::new(5).unwrap(),
         storage: RemoteStorageKind::AwsS3(S3Config {
             bucket_name: remote_storage_s3_bucket,
             bucket_region: remote_storage_s3_region,
-            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+            prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
             endpoint: None,
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
             max_keys_per_list_response,
diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 163c8c0467..88d50905c6 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -89,6 +89,22 @@ impl Generation {
             Self::Broken => panic!("Attempted to use a broken generation"),
         }
     }
+
+    pub fn next(&self) -> Generation {
+        match self {
+            Self::Valid(n) => Self::Valid(*n + 1),
+            Self::None => Self::Valid(1),
+            Self::Broken => panic!("Attempted to use a broken generation"),
+        }
+    }
+
+    pub fn into(self) -> Option<u32> {
+        if let Self::Valid(v) = self {
+            Some(v)
+        } else {
+            None
+        }
+    }
 }
 
 impl Serialize for Generation {
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 527e486fd0..dd54cd6ecd 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -24,6 +24,9 @@ pub enum ApiError {
     #[error("Precondition failed: {0}")]
     PreconditionFailed(Box<str>),
 
+    #[error("Shutting down")]
+    ShuttingDown,
+
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -52,6 +55,10 @@ impl ApiError {
                 self.to_string(),
                 StatusCode::PRECONDITION_FAILED,
             ),
+            ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
+                "Shutting down".to_string(),
+                StatusCode::SERVICE_UNAVAILABLE,
+            ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                 err.to_string(),
                 StatusCode::INTERNAL_SERVER_ERROR,
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index f69c0603d5..7f17970c4c 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -216,6 +216,24 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
     }
 }
 
+/// When you will store a secret but want to make sure it won't
+/// be accidentally logged, wrap it in a SecretString, whose Debug
+/// implementation does not expose the contents.
+#[derive(Clone, Eq, PartialEq)]
+pub struct SecretString(String);
+
+impl SecretString {
+    pub fn get_contents(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl std::fmt::Debug for SecretString {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[SECRET]")
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use metrics::{core::Opts, IntCounterVec};
diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs
index 3254fa4501..15e972505e 100644
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -431,14 +431,14 @@ impl CgroupWatcher {
                             .context("failed to request upscale")?;
 
                         let memory_high =
-                            self.get_high_bytes().context("failed to get memory.high")?;
+                            self.get_memory_high_bytes().context("failed to get memory.high")?;
                         let new_high = memory_high + self.config.memory_high_increase_by_bytes;
                         info!(
                             current_high_bytes = memory_high,
                             new_high_bytes = new_high,
                             "updating memory.high"
                         );
-                        self.set_high_bytes(new_high)
+                        self.set_memory_high_bytes(new_high)
                             .context("failed to set memory.high")?;
                         last_memory_high_increase_at = Some(Instant::now());
                         continue;
@@ -556,14 +556,6 @@ impl CgroupWatcher {
     }
 }
 
-/// Represents a set of limits we apply to a cgroup to control memory usage.
-///
-/// Setting these values also affects the thresholds for receiving usage alerts.
-#[derive(Debug)]
-pub struct MemoryLimits {
-    pub high: u64,
-}
-
 // Methods for manipulating the actual cgroup
 impl CgroupWatcher {
     /// Get a handle on the freezer subsystem.
@@ -624,50 +616,29 @@ impl CgroupWatcher {
     }
 
     /// Set cgroup memory.high threshold.
-    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
+    }
+
+    /// Set the cgroup's memory.high to 'max', disabling it.
+    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Max)
+    }
+
+    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
         self.memory()
             .context("failed to get memory subsystem")?
             .set_mem(cgroups_rs::memory::SetMemory {
                 low: None,
-                high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
+                high: Some(value),
                 min: None,
                 max: None,
             })
-            .context("failed to set memory.high")
-    }
-
-    /// Set cgroup memory.high and memory.max.
-    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
-        info!(limits.high, path = self.path(), "writing new memory limits",);
-        self.memory()
-            .context("failed to get memory subsystem while setting memory limits")?
-            .set_mem(cgroups_rs::memory::SetMemory {
-                min: None,
-                low: None,
-                high: Some(MaxValue::Value(
-                    u64::min(limits.high, i64::MAX as u64) as i64
-                )),
-                max: None,
-            })
-            .context("failed to set memory limits")
-    }
-
-    /// Given some amount of available memory, set the desired cgroup memory limits
-    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
-        let new_high = self.config.calculate_memory_high_value(available_memory);
-        let limits = MemoryLimits { high: new_high };
-        info!(
-            path = self.path(),
-            memory = ?limits,
-            "setting cgroup memory",
-        );
-        self.set_limits(&limits)
-            .context("failed to set cgroup memory limits")?;
-        Ok(())
+            .map_err(anyhow::Error::from)
     }
 
     /// Get memory.high threshold.
-    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
+    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
         let high = self
             .memory()
             .context("failed to get memory subsystem while getting memory statistics")?
diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs
index 376017d784..09863c8936 100644
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -16,7 +16,7 @@ use tokio::sync::mpsc;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 
-use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
+use crate::cgroup::{CgroupWatcher, Sequenced};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -106,6 +106,51 @@ impl Runner {
             kill,
         };
 
+        // If we have both the cgroup and file cache integrations enabled, it's possible for
+        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
+        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
+        // we *do* still want to determine the file cache size before setting the cgroup's
+        // memory.high, so it's not as simple as just swapping the order.
+        //
+        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
+        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
+        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
+        // of a hacky solution, but helps with reliability.
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");
+
+            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
+                .context("failed to create cgroup manager")?;
+
+            info!("temporarily unsetting memory.high");
+
+            // Temporarily un-set cgroup memory.high; see above.
+            cgroup
+                .unset_memory_high()
+                .context("failed to unset memory.high")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            let cgroup_clone = Arc::clone(&cgroup);
+            spawn_with_cancel(
+                token.clone(),
+                |_| error!("cgroup watcher terminated"),
+                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
+            );
+
+            state.cgroup = Some(cgroup);
+        } else {
+            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
+            // This allows us to poll it in `Monitor::run` regardless of whether we
+            // are managing a cgroup or not. If we don't forget it, all receives will
+            // immediately return an error because the sender is droped and it will
+            // claim all select! statements, effectively turning `Monitor::run` into
+            // `loop { fail to receive }`.
+            mem::forget(requesting_send);
+        }
+
         let mut file_cache_reserved_bytes = 0;
         let mem = get_total_system_memory();
 
@@ -119,7 +164,7 @@ impl Runner {
                 false => FileCacheConfig::default_in_memory(),
             };
 
-            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+            let mut file_cache = FileCacheState::new(connstr, config, token)
                 .await
                 .context("failed to create file cache")?;
 
@@ -152,35 +197,15 @@ impl Runner {
             state.filecache = Some(file_cache);
         }
 
-        if let Some(name) = &args.cgroup {
-            let (mut cgroup, cgroup_event_stream) =
-                CgroupWatcher::new(name.clone(), requesting_send)
-                    .context("failed to create cgroup manager")?;
-
+        if let Some(cgroup) = &state.cgroup {
             let available = mem - file_cache_reserved_bytes;
+            let value = cgroup.config.calculate_memory_high_value(available);
+
+            info!(value, "setting memory.high");
 
             cgroup
-                .set_memory_limits(available)
-                .context("failed to set cgroup memory limits")?;
-
-            let cgroup = Arc::new(cgroup);
-
-            // Some might call this . . . cgroup v2
-            let cgroup_clone = Arc::clone(&cgroup);
-
-            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
-                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
-            });
-
-            state.cgroup = Some(cgroup);
-        } else {
-            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
-            // This allows us to poll it in `Monitor::run` regardless of whether we
-            // are managing a cgroup or not. If we don't forget it, all receives will
-            // immediately return an error because the sender is droped and it will
-            // claim all select! statements, effectively turning `Monitor::run` into
-            // `loop { fail to receive }`.
-            mem::forget(requesting_send);
+                .set_memory_high_bytes(value)
+                .context("failed to set cgroup memory.high")?;
         }
 
         Ok(state)
@@ -257,14 +282,11 @@ impl Runner {
                 new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
             }
 
-            let limits = MemoryLimits {
-                // new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
-                // since it is properly initialized in the previous cgroup if let block
-                high: new_cgroup_mem_high,
-            };
+            // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
+            // since it is properly initialized in the previous cgroup if let block
             cgroup
-                .set_limits(&limits)
-                .context("failed to set cgroup memory limits")?;
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
 
             let message = format!(
                 "set cgroup memory.high to {} MiB, of new max {} MiB",
@@ -327,12 +349,9 @@ impl Runner {
                 name = cgroup.path(),
                 "updating cgroup memory.high",
             );
-            let limits = MemoryLimits {
-                high: new_cgroup_mem_high,
-            };
             cgroup
-                .set_limits(&limits)
-                .context("failed to set file cache size")?;
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
         }
 
         Ok(())
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index b6a2117f9c..d8a00b677b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -20,6 +21,7 @@ use metrics::set_build_info_metric;
 use pageserver::{
     config::{defaults::*, PageServerConf},
     context::{DownloadBehavior, RequestContext},
+    deletion_queue::DeletionQueue,
     http, page_cache, page_service, task_mgr,
     task_mgr::TaskKind,
     task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -346,9 +348,22 @@ fn start_pageserver(
         }
     };
 
+    // Top-level cancellation token for the process
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
     // Set up remote storage client
     let remote_storage = create_remote_storage_client(conf)?;
 
+    // Set up deletion queue
+    let (deletion_queue, deletion_workers) = DeletionQueue::new(
+        remote_storage.clone(),
+        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        conf,
+    );
+    if let Some(deletion_workers) = deletion_workers {
+        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
+    }
+
     // Up to this point no significant I/O has been done: this should have been fast.  Record
     // duration prior to starting I/O intensive phase of startup.
     startup_checkpoint("initial", "Starting loading tenants");
@@ -379,13 +394,13 @@ fn start_pageserver(
     };
 
     // Scan the local 'tenants/' directory and start loading the tenants
-    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
-
+    let deletion_queue_client = deletion_queue.new_client();
     BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
         conf,
         TenantSharedResources {
             broker_client: broker_client.clone(),
             remote_storage: remote_storage.clone(),
+            deletion_queue_client,
         },
         order,
         shutdown_pageserver.clone(),
@@ -481,9 +496,10 @@ fn start_pageserver(
             http::routes::State::new(
                 conf,
                 http_auth.clone(),
-                remote_storage,
+                remote_storage.clone(),
                 broker_client.clone(),
                 disk_usage_eviction_state,
+                deletion_queue.new_client(),
             )
             .context("Failed to initialize router state")?,
         );
@@ -611,7 +627,12 @@ fn start_pageserver(
             // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
             // The plan is to change that over time.
             shutdown_pageserver.take();
-            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            ));
             unreachable!()
         }
     })
@@ -623,7 +644,7 @@ fn create_remote_storage_client(
     let config = if let Some(config) = &conf.remote_storage_config {
         config
     } else {
-        // No remote storage configured.
+        tracing::warn!("no remote storage configured, this is a deprecated configuration");
         return Ok(None);
     };
 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 8ee7f28c11..c3f2f14a74 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,6 +11,7 @@ use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
+use utils::logging::SecretString;
 
 use once_cell::sync::OnceCell;
 use reqwest::Url;
@@ -207,6 +208,9 @@ pub struct PageServerConf {
     pub background_task_maximum_delay: Duration,
 
     pub control_plane_api: Option<Url>,
+
+    /// JWT token for use with the control plane API.
+    pub control_plane_api_token: Option<SecretString>,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -283,6 +287,7 @@ struct PageServerConfigBuilder {
     background_task_maximum_delay: BuilderValue<Duration>,
 
     control_plane_api: BuilderValue<Option<Url>>,
+    control_plane_api_token: BuilderValue<Option<SecretString>>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -347,6 +352,7 @@ impl Default for PageServerConfigBuilder {
             .unwrap()),
 
             control_plane_api: Set(None),
+            control_plane_api_token: Set(None),
         }
     }
 }
@@ -475,8 +481,8 @@ impl PageServerConfigBuilder {
         self.background_task_maximum_delay = BuilderValue::Set(delay);
     }
 
-    pub fn control_plane_api(&mut self, api: Url) {
-        self.control_plane_api = BuilderValue::Set(Some(api))
+    pub fn control_plane_api(&mut self, api: Option<Url>) {
+        self.control_plane_api = BuilderValue::Set(api)
     }
 
     pub fn build(self) -> anyhow::Result<PageServerConf> {
@@ -567,6 +573,9 @@ impl PageServerConfigBuilder {
             control_plane_api: self
                 .control_plane_api
                 .ok_or(anyhow!("missing control_plane_api"))?,
+            control_plane_api_token: self
+                .control_plane_api_token
+                .ok_or(anyhow!("missing control_plane_api_token"))?,
         })
     }
 }
@@ -580,6 +589,27 @@ impl PageServerConf {
         self.workdir.join(TENANTS_SEGMENT_NAME)
     }
 
+    pub fn deletion_prefix(&self) -> PathBuf {
+        self.workdir.join("deletion")
+    }
+
+    pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix()
+            .join(format!("{sequence:016x}-{VERSION:02x}.list"))
+    }
+
+    pub fn deletion_header_path(&self) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
+    }
+
     pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
         self.tenants_path().join(tenant_id.to_string())
     }
@@ -747,7 +777,14 @@ impl PageServerConf {
                 },
                 "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                 "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
-                "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
+                "control_plane_api" => {
+                    let parsed = parse_toml_string(key, item)?;
+                    if parsed.is_empty() {
+                        builder.control_plane_api(None)
+                    } else {
+                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
+                    }
+                },
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -917,6 +954,7 @@ impl PageServerConf {
             ondemand_download_behavior_treat_error_as_warn: false,
             background_task_maximum_delay: Duration::ZERO,
             control_plane_api: None,
+            control_plane_api_token: None,
         }
     }
 }
@@ -1140,7 +1178,8 @@ background_task_maximum_delay = '334 s'
                 background_task_maximum_delay: humantime::parse_duration(
                     defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                 )?,
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1196,7 +1235,8 @@ background_task_maximum_delay = '334 s'
                 test_remote_failures: 0,
                 ondemand_download_behavior_treat_error_as_warn: false,
                 background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 192eb16789..3375392373 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,7 +1,9 @@
 use std::collections::HashMap;
 
-use hyper::StatusCode;
-use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
+use pageserver_api::control_api::{
+    ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+};
+use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{
@@ -12,25 +14,34 @@ use utils::{
 
 use crate::config::PageServerConf;
 
-// Backoffs when control plane requests do not succeed: compromise between reducing load
-// on control plane, and retrying frequently when we are blocked on a control plane
-// response to make progress.
-const BACKOFF_INCREMENT: f64 = 0.1;
-const BACKOFF_MAX: f64 = 10.0;
-
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub(crate) struct ControlPlaneClient {
+pub struct ControlPlaneClient {
     http_client: reqwest::Client,
     base_url: Url,
     node_id: NodeId,
     cancel: CancellationToken,
 }
 
+/// Represent operations which internally retry on all errors other than
+/// cancellation token firing: the only way they can fail is ShuttingDown.
+pub enum RetryForeverError {
+    ShuttingDown,
+}
+
+#[async_trait::async_trait]
+pub trait ControlPlaneGenerationsApi {
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+}
+
 impl ControlPlaneClient {
     /// A None return value indicates that the input `conf` object does not have control
     /// plane API enabled.
-    pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
+    pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
         let mut url = match conf.control_plane_api.as_ref() {
             Some(u) => u.clone(),
             None => return None,
@@ -42,39 +53,78 @@ impl ControlPlaneClient {
             segs.pop_if_empty().push("");
         }
 
-        let client = reqwest::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");
+        let mut client = reqwest::ClientBuilder::new();
+
+        if let Some(jwt) = &conf.control_plane_api_token {
+            let mut headers = hyper::HeaderMap::new();
+            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            client = client.default_headers(headers);
+        }
 
         Some(Self {
-            http_client: client,
+            http_client: client.build().expect("Failed to construct HTTP client"),
             base_url: url,
             node_id: conf.id,
             cancel: cancel.clone(),
         })
     }
 
-    async fn try_re_attach(
+    async fn retry_http_forever<R, T>(
         &self,
-        url: Url,
-        request: &ReAttachRequest,
-    ) -> anyhow::Result<ReAttachResponse> {
-        match self.http_client.post(url).json(request).send().await {
-            Err(e) => Err(anyhow::Error::from(e)),
-            Ok(r) => {
-                if r.status() == StatusCode::OK {
-                    r.json::<ReAttachResponse>()
-                        .await
-                        .map_err(anyhow::Error::from)
-                } else {
-                    Err(anyhow::anyhow!("Unexpected status {}", r.status()))
-                }
+        url: &url::Url,
+        request: R,
+    ) -> Result<T, RetryForeverError>
+    where
+        R: Serialize,
+        T: DeserializeOwned,
+    {
+        #[derive(thiserror::Error, Debug)]
+        enum RemoteAttemptError {
+            #[error("shutdown")]
+            Shutdown,
+            #[error("remote: {0}")]
+            Remote(reqwest::Error),
+        }
+
+        match backoff::retry(
+            || async {
+                let response = self
+                    .http_client
+                    .post(url.clone())
+                    .json(&request)
+                    .send()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)?;
+
+                response
+                    .error_for_status_ref()
+                    .map_err(RemoteAttemptError::Remote)?;
+                response
+                    .json::<T>()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)
+            },
+            |_| false,
+            3,
+            u32::MAX,
+            "calling control plane generation validation API",
+            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
+        )
+        .await
+        {
+            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
+            Err(RemoteAttemptError::Remote(_)) => {
+                panic!("We retry forever, this should never be reached");
             }
+            Ok(r) => Ok(r),
         }
     }
+}
 
-    /// Block until we get a successful response
-    pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
+#[async_trait::async_trait]
+impl ControlPlaneGenerationsApi for ControlPlaneClient {
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
         let re_attach_path = self
             .base_url
             .join("re-attach")
@@ -83,37 +133,47 @@ impl ControlPlaneClient {
             node_id: self.node_id,
         };
 
-        let mut attempt = 0;
-        loop {
-            let result = self.try_re_attach(re_attach_path.clone(), &request).await;
-            match result {
-                Ok(res) => {
-                    tracing::info!(
-                        "Received re-attach response with {} tenants",
-                        res.tenants.len()
-                    );
+        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
+        tracing::info!(
+            "Received re-attach response with {} tenants",
+            response.tenants.len()
+        );
 
-                    return Ok(res
-                        .tenants
-                        .into_iter()
-                        .map(|t| (t.id, Generation::new(t.generation)))
-                        .collect::<HashMap<_, _>>());
-                }
-                Err(e) => {
-                    tracing::error!("Error re-attaching tenants, retrying: {e:#}");
-                    backoff::exponential_backoff(
-                        attempt,
-                        BACKOFF_INCREMENT,
-                        BACKOFF_MAX,
-                        &self.cancel,
-                    )
-                    .await;
-                    if self.cancel.is_cancelled() {
-                        return Err(anyhow::anyhow!("Shutting down"));
-                    }
-                    attempt += 1;
-                }
-            }
-        }
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|t| (t.id, Generation::new(t.generation)))
+            .collect::<HashMap<_, _>>())
+    }
+
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+        let re_attach_path = self
+            .base_url
+            .join("validate")
+            .expect("Failed to build validate path");
+
+        let request = ValidateRequest {
+            tenants: tenants
+                .into_iter()
+                .map(|(id, gen)| ValidateRequestTenant {
+                    id,
+                    gen: gen
+                        .into()
+                        .expect("Generation should always be valid for a Tenant doing deletions"),
+                })
+                .collect(),
+        };
+
+        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
+
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|rt| (rt.id, rt.valid))
+            .collect())
     }
 }
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
new file mode 100644
index 0000000000..4c0d399789
--- /dev/null
+++ b/pageserver/src/deletion_queue.rs
@@ -0,0 +1,1312 @@
+mod deleter;
+mod list_writer;
+mod validator;
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::virtual_file::VirtualFile;
+use anyhow::Context;
+use hex::FromHex;
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use serde::Deserialize;
+use serde::Serialize;
+use serde_with::serde_as;
+use thiserror::Error;
+use tokio;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+use tracing::{self, debug, error};
+use utils::crashsafe::path_with_suffix_extension;
+use utils::generation::Generation;
+use utils::id::{TenantId, TimelineId};
+use utils::lsn::AtomicLsn;
+use utils::lsn::Lsn;
+
+use self::deleter::Deleter;
+use self::list_writer::DeletionOp;
+use self::list_writer::ListWriter;
+use self::list_writer::RecoverOp;
+use self::validator::Validator;
+use deleter::DeleterMessage;
+use list_writer::ListWriterQueueMessage;
+use validator::ValidatorQueueMessage;
+
+use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
+
+// TODO: adminstrative "panic button" config property to disable all deletions
+// TODO: configurable for how long to wait before executing deletions
+
+/// We aggregate object deletions from many tenants in one place, for several reasons:
+/// - Coalesce deletions into fewer DeleteObjects calls
+/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
+///   to flush any outstanding deletions.
+/// - Globally control throughput of deletions, as these are a low priority task: do
+///   not compete with the same S3 clients/connections used for higher priority uploads.
+/// - Enable gating deletions on validation of a tenant's generation number, to make
+///   it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md)
+///
+/// There are two kinds of deletion: deferred and immediate.  A deferred deletion
+/// may be intentionally delayed to protect passive readers of S3 data, and is
+/// subject to a generation number validation step.  An immediate deletion is
+/// ready to execute immediately, and is only queued up so that it can be coalesced
+/// with other deletions in flight.
+///
+/// Deferred deletions pass through three steps:
+/// - ListWriter: accumulate deletion requests from Timelines, and batch them up into
+///   DeletionLists, which are persisted to disk.
+/// - Validator: accumulate deletion lists, and validate them en-masse prior to passing
+///   the keys in the list onward for actual deletion.  Also validate remote_consistent_lsn
+///   updates for running timelines.
+/// - Deleter: accumulate object keys that the validator has validated, and execute them in
+///   batches of 1000 keys via DeleteObjects.
+///
+/// Non-deferred deletions, such as during timeline deletion, bypass the first
+/// two stages and are passed straight into the Deleter.
+///
+/// Internally, each stage is joined by a channel to the next.  On disk, there is only
+/// one queue (of DeletionLists), which is written by the frontend and consumed
+/// by the backend.
+#[derive(Clone)]
+pub struct DeletionQueue {
+    client: DeletionQueueClient,
+
+    // Parent cancellation token for the tokens passed into background workers
+    cancel: CancellationToken,
+}
+
+/// Opaque wrapper around individual worker tasks, to avoid making the
+/// worker objects themselves public
+pub struct DeletionQueueWorkers<C>
+where
+    C: ControlPlaneGenerationsApi + Send + Sync,
+{
+    frontend: ListWriter,
+    backend: Validator<C>,
+    executor: Deleter,
+}
+
+impl<C> DeletionQueueWorkers<C>
+where
+    C: ControlPlaneGenerationsApi + Send + Sync + 'static,
+{
+    pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> {
+        let jh_frontend = runtime.spawn(async move {
+            self.frontend
+                .background()
+                .instrument(tracing::info_span!(parent:None, "deletion frontend"))
+                .await
+        });
+        let jh_backend = runtime.spawn(async move {
+            self.backend
+                .background()
+                .instrument(tracing::info_span!(parent:None, "deletion backend"))
+                .await
+        });
+        let jh_executor = runtime.spawn(async move {
+            self.executor
+                .background()
+                .instrument(tracing::info_span!(parent:None, "deletion executor"))
+                .await
+        });
+
+        runtime.spawn({
+            async move {
+                jh_frontend.await.expect("error joining frontend worker");
+                jh_backend.await.expect("error joining backend worker");
+                drop(jh_executor.await.expect("error joining executor worker"));
+            }
+        })
+    }
+}
+
+/// A FlushOp is just a oneshot channel, where we send the transmit side down
+/// another channel, and the receive side will receive a message when the channel
+/// we're flushing has reached the FlushOp we sent into it.
+///
+/// The only extra behavior beyond the channel is that the notify() method does not
+/// return an error when the receive side has been dropped, because in this use case
+/// it is harmless (the code that initiated the flush no longer cares about the result).
+#[derive(Debug)]
+struct FlushOp {
+    tx: tokio::sync::oneshot::Sender<()>,
+}
+
+impl FlushOp {
+    fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) {
+        let (tx, rx) = tokio::sync::oneshot::channel::<()>();
+        (Self { tx }, rx)
+    }
+
+    fn notify(self) {
+        if self.tx.send(()).is_err() {
+            // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
+            debug!("deletion queue flush from dropped client");
+        };
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct DeletionQueueClient {
+    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+
+    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct TenantDeletionList {
+    /// For each Timeline, a list of key fragments to append to the timeline remote path
+    /// when reconstructing a full key
+    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+    timelines: HashMap<TimelineId, Vec<String>>,
+
+    /// The generation in which this deletion was emitted: note that this may not be the
+    /// same as the generation of any layers being deleted.  The generation of the layer
+    /// has already been absorbed into the keys in `objects`
+    generation: Generation,
+}
+
+impl TenantDeletionList {
+    pub(crate) fn len(&self) -> usize {
+        self.timelines.values().map(|v| v.len()).sum()
+    }
+}
+
+/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
+fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+    V: Serialize,
+    I: AsRef<[u8]>,
+{
+    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
+
+    transformed
+        .collect::<HashMap<String, &V>>()
+        .serialize(serializer)
+}
+
+/// For HashMaps using a FromHex key, where we would like to decode the key
+fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+    V: Deserialize<'de>,
+    I: FromHex + std::hash::Hash + Eq,
+{
+    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
+    hex_map
+        .into_iter()
+        .map(|(k, v)| {
+            I::from_hex(k)
+                .map(|k| (k, v))
+                .map_err(|_| serde::de::Error::custom("Invalid hex ID"))
+        })
+        .collect()
+}
+
+/// Files ending with this suffix will be ignored and erased
+/// during recovery as startup.
+const TEMP_SUFFIX: &str = ".tmp";
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+struct DeletionList {
+    /// Serialization version, for future use
+    version: u8,
+
+    /// Used for constructing a unique key for each deletion list we write out.
+    sequence: u64,
+
+    /// To avoid repeating tenant/timeline IDs in every key, we store keys in
+    /// nested HashMaps by TenantTimelineID.  Each Tenant only appears once
+    /// with one unique generation ID: if someone tries to push a second generation
+    /// ID for the same tenant, we will start a new DeletionList.
+    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+    tenants: HashMap<TenantId, TenantDeletionList>,
+
+    /// Avoid having to walk `tenants` to calculate the number of keys in
+    /// the nested deletion lists
+    size: usize,
+
+    /// Set to true when the list has undergone validation with the control
+    /// plane and the remaining contents of `tenants` are valid.  A list may
+    /// also be implicitly marked valid by DeletionHeader.validated_sequence
+    /// advancing to >= DeletionList.sequence
+    #[serde(default)]
+    #[serde(skip_serializing_if = "std::ops::Not::not")]
+    validated: bool,
+}
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+struct DeletionHeader {
+    /// Serialization version, for future use
+    version: u8,
+
+    /// The highest sequence number (inclusive) that has been validated.  All deletion
+    /// lists on disk with a sequence <= this value are safe to execute.
+    validated_sequence: u64,
+}
+
+impl DeletionHeader {
+    const VERSION_LATEST: u8 = 1;
+
+    fn new(validated_sequence: u64) -> Self {
+        Self {
+            version: Self::VERSION_LATEST,
+            validated_sequence,
+        }
+    }
+
+    async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
+        debug!("Saving deletion list header {:?}", self);
+        let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
+        let header_path = conf.deletion_header_path();
+        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
+        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
+            .await
+            .map_err(Into::into)
+    }
+}
+
+impl DeletionList {
+    const VERSION_LATEST: u8 = 1;
+    fn new(sequence: u64) -> Self {
+        Self {
+            version: Self::VERSION_LATEST,
+            sequence,
+            tenants: HashMap::new(),
+            size: 0,
+            validated: false,
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.tenants.is_empty()
+    }
+
+    fn len(&self) -> usize {
+        self.size
+    }
+
+    /// Returns true if the push was accepted, false if the caller must start a new
+    /// deletion list.
+    fn push(
+        &mut self,
+        tenant: &TenantId,
+        timeline: &TimelineId,
+        generation: Generation,
+        objects: &mut Vec<RemotePath>,
+    ) -> bool {
+        if objects.is_empty() {
+            // Avoid inserting an empty TimelineDeletionList: this preserves the property
+            // that if we have no keys, then self.objects is empty (used in Self::is_empty)
+            return true;
+        }
+
+        let tenant_entry = self
+            .tenants
+            .entry(*tenant)
+            .or_insert_with(|| TenantDeletionList {
+                timelines: HashMap::new(),
+                generation,
+            });
+
+        if tenant_entry.generation != generation {
+            // Only one generation per tenant per list: signal to
+            // caller to start a new list.
+            return false;
+        }
+
+        let timeline_entry = tenant_entry
+            .timelines
+            .entry(*timeline)
+            .or_insert_with(Vec::new);
+
+        let timeline_remote_path = remote_timeline_path(tenant, timeline);
+
+        self.size += objects.len();
+        timeline_entry.extend(objects.drain(..).map(|p| {
+            p.strip_prefix(&timeline_remote_path)
+                .expect("Timeline paths always start with the timeline prefix")
+                .to_string_lossy()
+                .to_string()
+        }));
+        true
+    }
+
+    fn into_remote_paths(self) -> Vec<RemotePath> {
+        let mut result = Vec::new();
+        for (tenant, tenant_deletions) in self.tenants.into_iter() {
+            for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
+                let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
+                result.extend(
+                    timeline_layers
+                        .into_iter()
+                        .map(|l| timeline_remote_path.join(&PathBuf::from(l))),
+                );
+            }
+        }
+
+        result
+    }
+
+    async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
+        let path = conf.deletion_list_path(self.sequence);
+        let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
+
+        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
+        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
+            .await
+            .map_err(Into::into)
+    }
+}
+
+impl std::fmt::Display for DeletionList {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "DeletionList<seq={}, tenants={}, keys={}>",
+            self.sequence,
+            self.tenants.len(),
+            self.size
+        )
+    }
+}
+
+struct PendingLsn {
+    projected: Lsn,
+    result_slot: Arc<AtomicLsn>,
+}
+
+struct TenantLsnState {
+    timelines: HashMap<TimelineId, PendingLsn>,
+
+    // In what generation was the most recent update proposed?
+    generation: Generation,
+}
+
+#[derive(Default)]
+struct VisibleLsnUpdates {
+    tenants: HashMap<TenantId, TenantLsnState>,
+}
+
+impl VisibleLsnUpdates {
+    fn new() -> Self {
+        Self {
+            tenants: HashMap::new(),
+        }
+    }
+}
+
+impl std::fmt::Debug for VisibleLsnUpdates {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len())
+    }
+}
+
+#[derive(Error, Debug)]
+pub enum DeletionQueueError {
+    #[error("Deletion queue unavailable during shutdown")]
+    ShuttingDown,
+}
+
+impl DeletionQueueClient {
+    pub(crate) fn broken() -> Self {
+        // Channels whose receivers are immediately dropped.
+        let (tx, _rx) = tokio::sync::mpsc::channel(1);
+        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
+        Self {
+            tx,
+            executor_tx,
+            lsn_table: Arc::default(),
+        }
+    }
+
+    /// This is cancel-safe.  If you drop the future before it completes, the message
+    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
+    /// we decide to do a deletion the decision is always final.
+    async fn do_push<T>(
+        &self,
+        queue: &tokio::sync::mpsc::Sender<T>,
+        msg: T,
+    ) -> Result<(), DeletionQueueError> {
+        match queue.send(msg).await {
+            Ok(_) => Ok(()),
+            Err(e) => {
+                // This shouldn't happen, we should shut down all tenants before
+                // we shut down the global delete queue.  If we encounter a bug like this,
+                // we may leak objects as deletions won't be processed.
+                error!("Deletion queue closed while pushing, shutting down? ({e})");
+                Err(DeletionQueueError::ShuttingDown)
+            }
+        }
+    }
+
+    pub(crate) async fn recover(
+        &self,
+        attached_tenants: HashMap<TenantId, Generation>,
+    ) -> Result<(), DeletionQueueError> {
+        self.do_push(
+            &self.tx,
+            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
+        )
+        .await
+    }
+
+    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
+    /// world, it must validate its generation number before doing so.  Rather than do this synchronously,
+    /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most
+    /// recently validated separately.
+    ///
+    /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates.  The
+    /// backend will later wake up and notice that the tenant's generation requires validation.
+    pub(crate) async fn update_remote_consistent_lsn(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        lsn: Lsn,
+        result_slot: Arc<AtomicLsn>,
+    ) {
+        let mut locked = self
+            .lsn_table
+            .write()
+            .expect("Lock should never be poisoned");
+
+        let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
+            timelines: HashMap::new(),
+            generation: current_generation,
+        });
+
+        if tenant_entry.generation != current_generation {
+            // Generation might have changed if we were detached and then re-attached: in this case,
+            // state from the previous generation cannot be trusted.
+            tenant_entry.timelines.clear();
+            tenant_entry.generation = current_generation;
+        }
+
+        tenant_entry.timelines.insert(
+            timeline_id,
+            PendingLsn {
+                projected: lsn,
+                result_slot,
+            },
+        );
+    }
+
+    /// Submit a list of layers for deletion: this function will return before the deletion is
+    /// persistent, but it may be executed at any time after this function enters: do not push
+    /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
+    /// references them).
+    ///
+    /// The `current_generation` is the generation of this pageserver's current attachment.  The
+    /// generations in `layers` are the generations in which those layers were written.
+    pub(crate) async fn push_layers(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> Result<(), DeletionQueueError> {
+        if current_generation.is_none() {
+            debug!("Enqueuing deletions in legacy mode, skipping queue");
+            let mut layer_paths = Vec::new();
+            for (layer, generation) in layers {
+                layer_paths.push(remote_layer_path(
+                    &tenant_id,
+                    &timeline_id,
+                    &layer,
+                    generation,
+                ));
+            }
+            self.push_immediate(layer_paths).await?;
+            return self.flush_immediate().await;
+        }
+
+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(layers.len() as u64);
+        self.do_push(
+            &self.tx,
+            ListWriterQueueMessage::Delete(DeletionOp {
+                tenant_id,
+                timeline_id,
+                layers,
+                generation: current_generation,
+                objects: Vec::new(),
+            }),
+        )
+        .await
+    }
+
+    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
+    async fn do_flush<T>(
+        &self,
+        queue: &tokio::sync::mpsc::Sender<T>,
+        msg: T,
+        rx: tokio::sync::oneshot::Receiver<()>,
+    ) -> Result<(), DeletionQueueError> {
+        self.do_push(queue, msg).await?;
+        if rx.await.is_err() {
+            // This shouldn't happen if tenants are shut down before deletion queue.  If we
+            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
+            // when it hasn't, possibly leading to leaking objects.
+            error!("Deletion queue dropped flush op while client was still waiting");
+            Err(DeletionQueueError::ShuttingDown)
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
+    ///
+    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
+    pub async fn flush(&self) -> Result<(), DeletionQueueError> {
+        let (flush_op, rx) = FlushOp::new();
+        self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx)
+            .await
+    }
+
+    // Wait until all previous deletions are executed
+    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
+        debug!("flush_execute: flushing to deletion lists...");
+        // Flush any buffered work to deletion lists
+        self.flush().await?;
+
+        // Flush the backend into the executor of deletion lists
+        let (flush_op, rx) = FlushOp::new();
+        debug!("flush_execute: flushing backend...");
+        self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx)
+            .await?;
+        debug!("flush_execute: finished flushing backend...");
+
+        // Flush any immediate-mode deletions (the above backend flush will only flush
+        // the executor if deletions had flowed through the backend)
+        debug!("flush_execute: flushing execution...");
+        let (flush_op, rx) = FlushOp::new();
+        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
+            .await?;
+        debug!("flush_execute: finished flushing execution...");
+        Ok(())
+    }
+
+    /// This interface bypasses the persistent deletion queue, and any validation
+    /// that this pageserver is still elegible to execute the deletions.  It is for
+    /// use in timeline deletions, where the control plane is telling us we may
+    /// delete everything in the timeline.
+    ///
+    /// DO NOT USE THIS FROM GC OR COMPACTION CODE.  Use the regular `push_layers`.
+    pub(crate) async fn push_immediate(
+        &self,
+        objects: Vec<RemotePath>,
+    ) -> Result<(), DeletionQueueError> {
+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(objects.len() as u64);
+        self.executor_tx
+            .send(DeleterMessage::Delete(objects))
+            .await
+            .map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+
+    /// Companion to push_immediate.  When this returns Ok, all prior objects sent
+    /// into push_immediate have been deleted from remote storage.
+    pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
+        let (flush_op, rx) = FlushOp::new();
+        self.executor_tx
+            .send(DeleterMessage::Flush(flush_op))
+            .await
+            .map_err(|_| DeletionQueueError::ShuttingDown)?;
+
+        rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+}
+
+impl DeletionQueue {
+    pub fn new_client(&self) -> DeletionQueueClient {
+        self.client.clone()
+    }
+
+    /// Caller may use the returned object to construct clients with new_client.
+    /// Caller should tokio::spawn the background() members of the two worker objects returned:
+    /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
+    ///
+    /// If remote_storage is None, then the returned workers will also be None.
+    pub fn new<C>(
+        remote_storage: Option<GenericRemoteStorage>,
+        control_plane_client: Option<C>,
+        conf: &'static PageServerConf,
+    ) -> (Self, Option<DeletionQueueWorkers<C>>)
+    where
+        C: ControlPlaneGenerationsApi + Send + Sync,
+    {
+        // Deep channel: it consumes deletions from all timelines and we do not want to block them
+        let (tx, rx) = tokio::sync::mpsc::channel(16384);
+
+        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
+        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
+
+        // Shallow channel: it carries lists of paths, and we expect the main queueing to
+        // happen in the backend (persistent), not in this queue.
+        let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
+
+        let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new()));
+
+        // The deletion queue has an independent cancellation token to
+        // the general pageserver shutdown token, because it stays alive a bit
+        // longer to flush after Tenants have all been torn down.
+        let cancel = CancellationToken::new();
+
+        let remote_storage = match remote_storage {
+            None => {
+                return (
+                    Self {
+                        client: DeletionQueueClient {
+                            tx,
+                            executor_tx,
+                            lsn_table: lsn_table.clone(),
+                        },
+                        cancel,
+                    },
+                    None,
+                )
+            }
+            Some(r) => r,
+        };
+
+        (
+            Self {
+                client: DeletionQueueClient {
+                    tx,
+                    executor_tx: executor_tx.clone(),
+                    lsn_table: lsn_table.clone(),
+                },
+                cancel: cancel.clone(),
+            },
+            Some(DeletionQueueWorkers {
+                frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()),
+                backend: Validator::new(
+                    conf,
+                    backend_rx,
+                    executor_tx,
+                    control_plane_client,
+                    lsn_table.clone(),
+                    cancel.clone(),
+                ),
+                executor: Deleter::new(remote_storage, executor_rx, cancel.clone()),
+            }),
+        )
+    }
+
+    pub async fn shutdown(&mut self, timeout: Duration) {
+        self.cancel.cancel();
+
+        match tokio::time::timeout(timeout, self.client.flush()).await {
+            Ok(Ok(())) => {
+                tracing::info!("Deletion queue flushed successfully on shutdown")
+            }
+            Ok(Err(DeletionQueueError::ShuttingDown)) => {
+                // This is not harmful for correctness, but is unexpected: the deletion
+                // queue's workers should stay alive as long as there are any client handles instantiated.
+                tracing::warn!("Deletion queue stopped prematurely");
+            }
+            Err(_timeout) => {
+                tracing::warn!("Timed out flushing deletion queue on shutdown")
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use hex_literal::hex;
+    use std::{
+        io::ErrorKind,
+        path::{Path, PathBuf},
+        time::Duration,
+    };
+    use tracing::info;
+
+    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
+    use tokio::task::JoinHandle;
+
+    use crate::{
+        control_plane_client::RetryForeverError,
+        repository::Key,
+        tenant::{
+            harness::TenantHarness, remote_timeline_client::remote_timeline_path,
+            storage_layer::DeltaFileName,
+        },
+    };
+
+    use super::*;
+    pub const TIMELINE_ID: TimelineId =
+        TimelineId::from_array(hex!("11223344556677881122334455667788"));
+
+    pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
+        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
+        lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
+    });
+
+    // When you need a second layer in a test.
+    pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
+        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
+        lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
+    });
+
+    struct TestSetup {
+        harness: TenantHarness,
+        remote_fs_dir: PathBuf,
+        storage: GenericRemoteStorage,
+        mock_control_plane: MockControlPlane,
+        deletion_queue: DeletionQueue,
+        worker_join: JoinHandle<()>,
+    }
+
+    impl TestSetup {
+        /// Simulate a pageserver restart by destroying and recreating the deletion queue
+        async fn restart(&mut self) {
+            let (deletion_queue, workers) = DeletionQueue::new(
+                Some(self.storage.clone()),
+                Some(self.mock_control_plane.clone()),
+                self.harness.conf,
+            );
+
+            tracing::debug!("Spawning worker for new queue queue");
+            let worker_join = workers
+                .unwrap()
+                .spawn_with(&tokio::runtime::Handle::current());
+
+            let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join);
+            let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue);
+
+            tracing::debug!("Joining worker from previous queue");
+            old_deletion_queue.cancel.cancel();
+            old_worker_join
+                .await
+                .expect("Failed to join workers for previous deletion queue");
+        }
+
+        fn set_latest_generation(&self, gen: Generation) {
+            let tenant_id = self.harness.tenant_id;
+            self.mock_control_plane
+                .latest_generation
+                .lock()
+                .unwrap()
+                .insert(tenant_id, gen);
+        }
+
+        /// Returns remote layer file name, suitable for use in assert_remote_files
+        fn write_remote_layer(
+            &self,
+            file_name: LayerFileName,
+            gen: Generation,
+        ) -> anyhow::Result<String> {
+            let tenant_id = self.harness.tenant_id;
+            let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+            let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
+            std::fs::create_dir_all(&remote_timeline_path)?;
+            let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
+
+            let content: Vec<u8> = format!("placeholder contents of {file_name}").into();
+
+            std::fs::write(
+                remote_timeline_path.join(remote_layer_file_name.clone()),
+                content,
+            )?;
+
+            Ok(remote_layer_file_name)
+        }
+    }
+
+    #[derive(Debug, Clone)]
+    struct MockControlPlane {
+        pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
+    }
+
+    impl MockControlPlane {
+        fn new() -> Self {
+            Self {
+                latest_generation: Arc::default(),
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl ControlPlaneGenerationsApi for MockControlPlane {
+        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
+        async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+            unimplemented!()
+        }
+        async fn validate(
+            &self,
+            tenants: Vec<(TenantId, Generation)>,
+        ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+            let mut result = HashMap::new();
+
+            let latest_generation = self.latest_generation.lock().unwrap();
+
+            for (tenant_id, generation) in tenants {
+                if let Some(latest) = latest_generation.get(&tenant_id) {
+                    result.insert(tenant_id, *latest == generation);
+                }
+            }
+
+            Ok(result)
+        }
+    }
+
+    fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
+        let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
+        let harness = TenantHarness::create(test_name)?;
+
+        // We do not load() the harness: we only need its config and remote_storage
+
+        // Set up a GenericRemoteStorage targetting a directory
+        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+        std::fs::create_dir_all(remote_fs_dir)?;
+        let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+        let storage_config = RemoteStorageConfig {
+            max_concurrent_syncs: std::num::NonZeroUsize::new(
+                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+            )
+            .unwrap(),
+            max_sync_errors: std::num::NonZeroU32::new(
+                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+            )
+            .unwrap(),
+            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+        };
+        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+
+        let mock_control_plane = MockControlPlane::new();
+
+        let (deletion_queue, worker) = DeletionQueue::new(
+            Some(storage.clone()),
+            Some(mock_control_plane.clone()),
+            harness.conf,
+        );
+
+        let worker = worker.unwrap();
+        let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());
+
+        Ok(TestSetup {
+            harness,
+            remote_fs_dir,
+            storage,
+            mock_control_plane,
+            deletion_queue,
+            worker_join,
+        })
+    }
+
+    // TODO: put this in a common location so that we can share with remote_timeline_client's tests
+    fn assert_remote_files(expected: &[&str], remote_path: &Path) {
+        let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
+        expected.sort();
+
+        let mut found: Vec<String> = Vec::new();
+        let dir = match std::fs::read_dir(remote_path) {
+            Ok(d) => d,
+            Err(e) => {
+                if e.kind() == ErrorKind::NotFound {
+                    if expected.is_empty() {
+                        // We are asserting prefix is empty: it is expected that the dir is missing
+                        return;
+                    } else {
+                        assert_eq!(expected, Vec::<String>::new());
+                        unreachable!();
+                    }
+                } else {
+                    panic!(
+                        "Unexpected error listing {}: {e}",
+                        remote_path.to_string_lossy()
+                    );
+                }
+            }
+        };
+
+        for entry in dir.flatten() {
+            let entry_name = entry.file_name();
+            let fname = entry_name.to_str().unwrap();
+            found.push(String::from(fname));
+        }
+        found.sort();
+
+        assert_eq!(expected, found);
+    }
+
+    fn assert_local_files(expected: &[&str], directory: &Path) {
+        let dir = match std::fs::read_dir(directory) {
+            Ok(d) => d,
+            Err(_) => {
+                assert_eq!(expected, &Vec::<String>::new());
+                return;
+            }
+        };
+        let mut found = Vec::new();
+        for dentry in dir {
+            let dentry = dentry.unwrap();
+            let file_name = dentry.file_name();
+            let file_name_str = file_name.to_string_lossy();
+            found.push(file_name_str.to_string());
+        }
+        found.sort();
+        assert_eq!(expected, found);
+    }
+
+    #[tokio::test]
+    async fn deletion_queue_smoke() -> anyhow::Result<()> {
+        // Basic test that the deletion queue processes the deletions we pass into it
+        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
+        let client = ctx.deletion_queue.new_client();
+        client.recover(HashMap::new()).await?;
+
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let tenant_id = ctx.harness.tenant_id;
+
+        let content: Vec<u8> = "victim1 contents".into();
+        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+        let deletion_prefix = ctx.harness.conf.deletion_prefix();
+
+        // Exercise the distinction between the generation of the layers
+        // we delete, and the generation of the running Tenant.
+        let layer_generation = Generation::new(0xdeadbeef);
+        let now_generation = Generation::new(0xfeedbeef);
+
+        let remote_layer_file_name_1 =
+            format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
+
+        // Set mock control plane state to valid for our generation
+        ctx.set_latest_generation(now_generation);
+
+        // Inject a victim file to remote storage
+        info!("Writing");
+        std::fs::create_dir_all(&remote_timeline_path)?;
+        std::fs::write(
+            remote_timeline_path.join(remote_layer_file_name_1.clone()),
+            content,
+        )?;
+        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+
+        // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
+        info!("Pushing");
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                now_generation,
+                [(layer_file_name_1.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+
+        assert_local_files(&[], &deletion_prefix);
+
+        // File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
+        info!("Flushing");
+        client.flush().await?;
+        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+        assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
+
+        // File should go away when we execute
+        info!("Flush-executing");
+        client.flush_execute().await?;
+        assert_remote_files(&[], &remote_timeline_path);
+        assert_local_files(&["header-01"], &deletion_prefix);
+
+        // Flushing on an empty queue should succeed immediately, and not write any lists
+        info!("Flush-executing on empty");
+        client.flush_execute().await?;
+        assert_local_files(&["header-01"], &deletion_prefix);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn deletion_queue_validation() -> anyhow::Result<()> {
+        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
+        let client = ctx.deletion_queue.new_client();
+        client.recover(HashMap::new()).await?;
+
+        // Generation that the control plane thinks is current
+        let latest_generation = Generation::new(0xdeadbeef);
+        // Generation that our DeletionQueue thinks the tenant is running with
+        let stale_generation = latest_generation.previous();
+        // Generation that our example layer file was written with
+        let layer_generation = stale_generation.previous();
+
+        ctx.set_latest_generation(latest_generation);
+
+        let tenant_id = ctx.harness.tenant_id;
+        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+
+        // Initial state: a remote layer exists
+        let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
+        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
+
+        tracing::debug!("Pushing...");
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                stale_generation,
+                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        // We enqueued the operation in a stale generation: it should have failed validation
+        tracing::debug!("Flushing...");
+        tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
+        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
+
+        tracing::debug!("Pushing...");
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                latest_generation,
+                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        // We enqueued the operation in a fresh generation: it should have passed validation
+        tracing::debug!("Flushing...");
+        tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
+        assert_remote_files(&[], &remote_timeline_path);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn deletion_queue_recovery() -> anyhow::Result<()> {
+        // Basic test that the deletion queue processes the deletions we pass into it
+        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
+        let client = ctx.deletion_queue.new_client();
+        client.recover(HashMap::new()).await?;
+
+        let tenant_id = ctx.harness.tenant_id;
+
+        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+        let deletion_prefix = ctx.harness.conf.deletion_prefix();
+
+        let layer_generation = Generation::new(0xdeadbeef);
+        let now_generation = Generation::new(0xfeedbeef);
+
+        // Inject a deletion in the generation before generation_now: after restart,
+        // this deletion should _not_ get executed (only the immediately previous
+        // generation gets that treatment)
+        let remote_layer_file_name_historical =
+            ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                now_generation.previous(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        // Inject a deletion in the generation before generation_now: after restart,
+        // this deletion should get executed, because we execute deletions in the
+        // immediately previous generation on the same node.
+        let remote_layer_file_name_previous =
+            ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                now_generation,
+                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        client.flush().await?;
+        assert_remote_files(
+            &[
+                &remote_layer_file_name_historical,
+                &remote_layer_file_name_previous,
+            ],
+            &remote_timeline_path,
+        );
+
+        // Different generatinos for the same tenant will cause two separate
+        // deletion lists to be emitted.
+        assert_local_files(
+            &["0000000000000001-01.list", "0000000000000002-01.list"],
+            &deletion_prefix,
+        );
+
+        // Simulate a node restart: the latest generation advances
+        let now_generation = now_generation.next();
+        ctx.set_latest_generation(now_generation);
+
+        // Restart the deletion queue
+        drop(client);
+        ctx.restart().await;
+        let client = ctx.deletion_queue.new_client();
+        client
+            .recover(HashMap::from([(tenant_id, now_generation)]))
+            .await?;
+
+        info!("Flush-executing");
+        client.flush_execute().await?;
+        // The deletion from immediately prior generation was executed, the one from
+        // an older generation was not.
+        assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path);
+        Ok(())
+    }
+}
+
+/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
+/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
+#[cfg(test)]
+pub(crate) mod mock {
+    use tracing::info;
+
+    use crate::tenant::remote_timeline_client::remote_layer_path;
+
+    use super::*;
+    use std::sync::{
+        atomic::{AtomicUsize, Ordering},
+        Arc,
+    };
+
+    pub struct ConsumerState {
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+    }
+
+    impl ConsumerState {
+        async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
+            let mut executed = 0;
+
+            info!("Executing all pending deletions");
+
+            // Transform all executor messages to generic frontend messages
+            while let Ok(msg) = self.executor_rx.try_recv() {
+                match msg {
+                    DeleterMessage::Delete(objects) => {
+                        for path in objects {
+                            match remote_storage.delete(&path).await {
+                                Ok(_) => {
+                                    debug!("Deleted {path}");
+                                }
+                                Err(e) => {
+                                    error!("Failed to delete {path}, leaking object! ({e})");
+                                }
+                            }
+                            executed += 1;
+                        }
+                    }
+                    DeleterMessage::Flush(flush_op) => {
+                        flush_op.notify();
+                    }
+                }
+            }
+
+            while let Ok(msg) = self.rx.try_recv() {
+                match msg {
+                    ListWriterQueueMessage::Delete(op) => {
+                        let mut objects = op.objects;
+                        for (layer, generation) in op.layers {
+                            objects.push(remote_layer_path(
+                                &op.tenant_id,
+                                &op.timeline_id,
+                                &layer,
+                                generation,
+                            ));
+                        }
+
+                        for path in objects {
+                            info!("Executing deletion {path}");
+                            match remote_storage.delete(&path).await {
+                                Ok(_) => {
+                                    debug!("Deleted {path}");
+                                }
+                                Err(e) => {
+                                    error!("Failed to delete {path}, leaking object! ({e})");
+                                }
+                            }
+                            executed += 1;
+                        }
+                    }
+                    ListWriterQueueMessage::Flush(op) => {
+                        op.notify();
+                    }
+                    ListWriterQueueMessage::FlushExecute(op) => {
+                        // We have already executed all prior deletions because mock does them inline
+                        op.notify();
+                    }
+                    ListWriterQueueMessage::Recover(_) => {
+                        // no-op in mock
+                    }
+                }
+                info!("All pending deletions have been executed");
+            }
+
+            executed
+        }
+    }
+
+    pub struct MockDeletionQueue {
+        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+        executed: Arc<AtomicUsize>,
+        remote_storage: Option<GenericRemoteStorage>,
+        consumer: std::sync::Mutex<ConsumerState>,
+        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+    }
+
+    impl MockDeletionQueue {
+        pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
+            let (tx, rx) = tokio::sync::mpsc::channel(16384);
+            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);
+
+            let executed = Arc::new(AtomicUsize::new(0));
+
+            Self {
+                tx,
+                executor_tx,
+                executed,
+                remote_storage,
+                consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
+                lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
+            }
+        }
+
+        pub fn get_executed(&self) -> usize {
+            self.executed.load(Ordering::Relaxed)
+        }
+
+        #[allow(clippy::await_holding_lock)]
+        pub async fn pump(&self) {
+            if let Some(remote_storage) = &self.remote_storage {
+                // Permit holding mutex across await, because this is only ever
+                // called once at a time in tests.
+                let mut locked = self.consumer.lock().unwrap();
+                let count = locked.consume(remote_storage).await;
+                self.executed.fetch_add(count, Ordering::Relaxed);
+            }
+        }
+
+        pub(crate) fn new_client(&self) -> DeletionQueueClient {
+            DeletionQueueClient {
+                tx: self.tx.clone(),
+                executor_tx: self.executor_tx.clone(),
+                lsn_table: self.lsn_table.clone(),
+            }
+        }
+    }
+}
diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs
new file mode 100644
index 0000000000..5c6e7dc9d7
--- /dev/null
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -0,0 +1,156 @@
+//! The deleter is the final stage in the deletion queue.  It accumulates remote
+//! paths to delete, and periodically executes them in batches of up to 1000
+//! using the DeleteObjects request.
+//!
+//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
+//! number of full-sized DeleteObjects requests, rather than a larger number of
+//! smaller requests.
+
+use remote_storage::GenericRemoteStorage;
+use remote_storage::RemotePath;
+use remote_storage::MAX_KEYS_PER_DELETE;
+use std::time::Duration;
+use tokio_util::sync::CancellationToken;
+use tracing::info;
+use tracing::warn;
+
+use crate::metrics;
+
+use super::DeletionQueueError;
+use super::FlushOp;
+
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+pub(super) enum DeleterMessage {
+    Delete(Vec<RemotePath>),
+    Flush(FlushOp),
+}
+
+/// Non-persistent deletion queue, for coalescing multiple object deletes into
+/// larger DeleteObjects requests.
+pub(super) struct Deleter {
+    // Accumulate up to 1000 keys for the next deletion operation
+    accumulator: Vec<RemotePath>,
+
+    rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+
+    cancel: CancellationToken,
+    remote_storage: GenericRemoteStorage,
+}
+
+impl Deleter {
+    pub(super) fn new(
+        remote_storage: GenericRemoteStorage,
+        rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            remote_storage,
+            rx,
+            cancel,
+            accumulator: Vec::new(),
+        }
+    }
+
+    /// Wrap the remote `delete_objects` with a failpoint
+    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+        fail::fail_point!("deletion-queue-before-execute", |_| {
+            info!("Skipping execution, failpoint set");
+            metrics::DELETION_QUEUE
+                .remote_errors
+                .with_label_values(&["failpoint"])
+                .inc();
+            Err(anyhow::anyhow!("failpoint hit"))
+        });
+
+        self.remote_storage.delete_objects(&self.accumulator).await
+    }
+
+    /// Block until everything in accumulator has been executed
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+            match self.remote_delete().await {
+                Ok(()) => {
+                    // Note: we assume that the remote storage layer returns Ok(()) if some
+                    // or all of the deleted objects were already gone.
+                    metrics::DELETION_QUEUE
+                        .keys_executed
+                        .inc_by(self.accumulator.len() as u64);
+                    info!(
+                        "Executed deletion batch {}..{}",
+                        self.accumulator
+                            .first()
+                            .expect("accumulator should be non-empty"),
+                        self.accumulator
+                            .last()
+                            .expect("accumulator should be non-empty"),
+                    );
+                    self.accumulator.clear();
+                }
+                Err(e) => {
+                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["execute"])
+                        .inc();
+                }
+            };
+        }
+        if self.cancel.is_cancelled() {
+            // Expose an error because we may not have actually flushed everything
+            Err(DeletionQueueError::ShuttingDown)
+        } else {
+            Ok(())
+        }
+    }
+
+    pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
+        self.accumulator.reserve(MAX_KEYS_PER_DELETE);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                return Err(DeletionQueueError::ShuttingDown);
+            }
+
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
+                    // return immediately if no work is pending
+                    self.flush().await?;
+
+                    continue;
+                }
+            };
+
+            match msg {
+                DeleterMessage::Delete(mut list) => {
+                    while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                        if self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                            self.flush().await?;
+                            // If we have received this number of keys, proceed with attempting to execute
+                            assert_eq!(self.accumulator.len(), 0);
+                        }
+
+                        let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
+                        let take_count = std::cmp::min(available_slots, list.len());
+                        for path in list.drain(list.len() - take_count..) {
+                            self.accumulator.push(path);
+                        }
+                    }
+                }
+                DeleterMessage::Flush(flush_op) => {
+                    // If flush() errors, we drop the flush_op and the caller will get
+                    // an error recv()'ing their oneshot channel.
+                    self.flush().await?;
+                    flush_op.notify();
+                }
+            }
+        }
+    }
+}
diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs
new file mode 100644
index 0000000000..618a59f8fe
--- /dev/null
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -0,0 +1,487 @@
+//! The list writer is the first stage in the deletion queue.  It accumulates
+//! layers to delete, and periodically writes out these layers into a persistent
+//! DeletionList.
+//!
+//! The purpose of writing DeletionLists is to decouple the decision to
+//! delete an object from the validation required to execute it: even if
+//! validation is not possible, e.g. due to a control plane outage, we can
+//! still persist our intent to delete an object, in a way that would
+//! survive a restart.
+//!
+//! DeletionLists are passed onwards to the Validator.
+
+use super::DeletionHeader;
+use super::DeletionList;
+use super::FlushOp;
+use super::ValidatorQueueMessage;
+
+use std::collections::HashMap;
+use std::fs::create_dir_all;
+use std::time::Duration;
+
+use regex::Regex;
+use remote_storage::RemotePath;
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+use utils::generation::Generation;
+use utils::id::TenantId;
+use utils::id::TimelineId;
+
+use crate::config::PageServerConf;
+use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::storage_layer::LayerFileName;
+
+// The number of keys in a DeletionList before we will proactively persist it
+// (without reaching a flush deadline).  This aims to deliver objects of the order
+// of magnitude 1MB when we are under heavy delete load.
+const DELETION_LIST_TARGET_SIZE: usize = 16384;
+
+// Ordinarily, we only flush to DeletionList periodically, to bound the window during
+// which we might leak objects from not flushing a DeletionList after
+// the objects are already unlinked from timeline metadata.
+const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
+
+// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
+// more objects before doing the flush.
+const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
+
+#[derive(Debug)]
+pub(super) struct DeletionOp {
+    pub(super) tenant_id: TenantId,
+    pub(super) timeline_id: TimelineId,
+    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
+    // have a config object handy to project it to a remote key, and need the consuming worker
+    // to do it for you.
+    pub(super) layers: Vec<(LayerFileName, Generation)>,
+    pub(super) objects: Vec<RemotePath>,
+
+    /// The _current_ generation of the Tenant attachment in which we are enqueuing
+    /// this deletion.
+    pub(super) generation: Generation,
+}
+
+#[derive(Debug)]
+pub(super) struct RecoverOp {
+    pub(super) attached_tenants: HashMap<TenantId, Generation>,
+}
+
+#[derive(Debug)]
+pub(super) enum ListWriterQueueMessage {
+    Delete(DeletionOp),
+    // Wait until all prior deletions make it into a persistent DeletionList
+    Flush(FlushOp),
+    // Wait until all prior deletions have been executed (i.e. objects are actually deleted)
+    FlushExecute(FlushOp),
+    // Call once after re-attaching to control plane, to notify the deletion queue about
+    // latest attached generations & load any saved deletion lists from disk.
+    Recover(RecoverOp),
+}
+
+pub(super) struct ListWriter {
+    conf: &'static PageServerConf,
+
+    // Incoming frontend requests to delete some keys
+    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+
+    // Outbound requests to the backend to execute deletion lists we have composed.
+    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+
+    // The list we are currently building, contains a buffer of keys to delete
+    // and our next sequence number
+    pending: DeletionList,
+
+    // These FlushOps should notify the next time we flush
+    pending_flushes: Vec<FlushOp>,
+
+    // Worker loop is torn down when this fires.
+    cancel: CancellationToken,
+
+    // Safety guard to do recovery exactly once
+    recovered: bool,
+}
+
+impl ListWriter {
+    // Initially DeletionHeader.validated_sequence is zero.  The place we start our
+    // sequence numbers must be higher than that.
+    const BASE_SEQUENCE: u64 = 1;
+
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            pending: DeletionList::new(Self::BASE_SEQUENCE),
+            conf,
+            rx,
+            tx,
+            pending_flushes: Vec::new(),
+            cancel,
+            recovered: false,
+        }
+    }
+
+    /// Try to flush `list` to persistent storage
+    ///
+    /// This does not return errors, because on failure to flush we do not lose
+    /// any state: flushing will be retried implicitly on the next deadline
+    async fn flush(&mut self) {
+        if self.pending.is_empty() {
+            for f in self.pending_flushes.drain(..) {
+                f.notify();
+            }
+            return;
+        }
+
+        match self.pending.save(self.conf).await {
+            Ok(_) => {
+                info!(sequence = self.pending.sequence, "Stored deletion list");
+
+                for f in self.pending_flushes.drain(..) {
+                    f.notify();
+                }
+
+                // Take the list we've accumulated, replace it with a fresh list for the next sequence
+                let next_list = DeletionList::new(self.pending.sequence + 1);
+                let list = std::mem::replace(&mut self.pending, next_list);
+
+                if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
+                    // This is allowed to fail: it will only happen if the backend worker is shut down,
+                    // so we can just drop this on the floor.
+                    info!("Deletion list dropped, this is normal during shutdown ({e:#})");
+                }
+            }
+            Err(e) => {
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                warn!(
+                    sequence = self.pending.sequence,
+                    "Failed to write deletion list, will retry later ({e:#})"
+                );
+            }
+        }
+    }
+
+    /// Load the header, to learn the sequence number up to which deletions
+    /// have been validated.  We will apply validated=true to DeletionLists
+    /// <= this sequence when loading them.
+    ///
+    /// It is not an error for the header to not exist: we return None, and
+    /// the caller should act as if validated_sequence is 0
+    async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
+        let header_path = self.conf.deletion_header_path();
+        match tokio::fs::read(&header_path).await {
+            Ok(header_bytes) => {
+                match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
+                    Ok(h) => Ok(Some(h.validated_sequence)),
+                    Err(e) => {
+                        warn!(
+                            "Failed to deserialize deletion header, ignoring {}: {e:#}",
+                            header_path.display()
+                        );
+                        // This should never happen unless we make a mistake with our serialization.
+                        // Ignoring a deletion header is not consequential for correctnes because all deletions
+                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        Ok(None)
+                    }
+                }
+            }
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    debug!(
+                        "Deletion header {} not found, first start?",
+                        header_path.display()
+                    );
+                    Ok(None)
+                } else {
+                    Err(anyhow::anyhow!(e))
+                }
+            }
+        }
+    }
+
+    async fn recover(
+        &mut self,
+        attached_tenants: HashMap<TenantId, Generation>,
+    ) -> Result<(), anyhow::Error> {
+        debug!(
+            "recovering with {} attached tenants",
+            attached_tenants.len()
+        );
+
+        // Load the header
+        let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
+
+        self.pending.sequence = validated_sequence + 1;
+
+        let deletion_directory = self.conf.deletion_prefix();
+        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+            Ok(d) => d,
+            Err(e) => {
+                warn!(
+                    "Failed to open deletion list directory {}: {e:#}",
+                    deletion_directory.display(),
+                );
+
+                // Give up: if we can't read the deletion list directory, we probably can't
+                // write lists into it later, so the queue won't work.
+                return Err(e.into());
+            }
+        };
+
+        let list_name_pattern =
+            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
+
+        let header_path = self.conf.deletion_header_path();
+        let mut seqs: Vec<u64> = Vec::new();
+        while let Some(dentry) = dir.next_entry().await? {
+            let file_name = dentry.file_name();
+            let dentry_str = file_name.to_string_lossy();
+
+            if Some(file_name.as_os_str()) == header_path.file_name() {
+                // Don't try and parse the header's name like a list
+                continue;
+            }
+
+            if dentry_str.ends_with(TEMP_SUFFIX) {
+                info!("Cleaning up temporary file {dentry_str}");
+                let absolute_path = deletion_directory.join(dentry.file_name());
+                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+                    // Non-fatal error: we will just leave the file behind but not
+                    // try and load it.
+                    warn!(
+                        "Failed to clean up temporary file {}: {e:#}",
+                        absolute_path.display()
+                    );
+                }
+
+                continue;
+            }
+
+            let file_name = dentry.file_name().to_owned();
+            let basename = file_name.to_string_lossy();
+            let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
+                m.name("sequence")
+                    .expect("Non optional group should be present")
+                    .as_str()
+            } else {
+                warn!("Unexpected key in deletion queue: {basename}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                continue;
+            };
+
+            let seq: u64 = match u64::from_str_radix(seq_part, 16) {
+                Ok(s) => s,
+                Err(e) => {
+                    warn!("Malformed key '{basename}': {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+            seqs.push(seq);
+        }
+        seqs.sort();
+
+        // Start our next deletion list from after the last location validated by
+        // previous process lifetime, or after the last location found (it is updated
+        // below after enumerating the deletion lists)
+        self.pending.sequence = validated_sequence + 1;
+        if let Some(max_list_seq) = seqs.last() {
+            self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
+        }
+
+        for s in seqs {
+            let list_path = self.conf.deletion_list_path(s);
+
+            let list_bytes = tokio::fs::read(&list_path).await?;
+
+            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
+                Ok(l) => l,
+                Err(e) => {
+                    // Drop the list on the floor: any objects it referenced will be left behind
+                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
+                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+
+            if deletion_list.sequence <= validated_sequence {
+                // If the deletion list falls below valid_seq, we may assume that it was
+                // already validated the last time this pageserver ran.  Otherwise, we still
+                // load it, as it may still contain content valid in this generation.
+                deletion_list.validated = true;
+            } else {
+                // Special case optimization: if a tenant is still attached, and no other
+                // generation was issued to another node in the interval while we restarted,
+                // then we may treat deletion lists from the previous generation as if they
+                // belong to our currently attached generation, and proceed to validate & execute.
+                for (tenant_id, tenant_list) in &mut deletion_list.tenants {
+                    if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+                        if attached_gen.previous() == tenant_list.generation {
+                            tenant_list.generation = *attached_gen;
+                        }
+                    }
+                }
+            }
+
+            info!(
+                validated = deletion_list.validated,
+                sequence = deletion_list.sequence,
+                "Recovered deletion list"
+            );
+
+            // We will drop out of recovery if this fails: it indicates that we are shutting down
+            // or the backend has panicked
+            metrics::DELETION_QUEUE
+                .keys_submitted
+                .inc_by(deletion_list.len() as u64);
+            self.tx
+                .send(ValidatorQueueMessage::Delete(deletion_list))
+                .await?;
+        }
+
+        info!(next_sequence = self.pending.sequence, "Replay complete");
+
+        Ok(())
+    }
+
+    /// This is the front-end ingest, where we bundle up deletion requests into DeletionList
+    /// and write them out, for later validation by the backend and execution by the executor.
+    pub(super) async fn background(&mut self) {
+        info!("Started deletion frontend worker");
+
+        // Synchronous, but we only do it once per process lifetime so it's tolerable
+        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+            tracing::error!(
+                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
+                self.conf.deletion_prefix().display()
+            );
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
+            return;
+        }
+
+        while !self.cancel.is_cancelled() {
+            let timeout = if self.pending_flushes.is_empty() {
+                FRONTEND_DEFAULT_TIMEOUT
+            } else {
+                FRONTEND_FLUSHING_TIMEOUT
+            };
+
+            let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
+                Ok(Some(msg)) => msg,
+                Ok(None) => {
+                    // Queue sender destroyed, shutting down
+                    break;
+                }
+                Err(_) => {
+                    // Hit deadline, flush.
+                    self.flush().await;
+                    continue;
+                }
+            };
+
+            match msg {
+                ListWriterQueueMessage::Delete(op) => {
+                    assert!(
+                        self.recovered,
+                        "Cannot process deletions before recovery.  This is a bug."
+                    );
+
+                    debug!(
+                        "Delete: ingesting {} layers, {} other objects",
+                        op.layers.len(),
+                        op.objects.len()
+                    );
+
+                    let mut layer_paths = Vec::new();
+                    for (layer, generation) in op.layers {
+                        layer_paths.push(remote_layer_path(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            &layer,
+                            generation,
+                        ));
+                    }
+                    layer_paths.extend(op.objects);
+
+                    if !self.pending.push(
+                        &op.tenant_id,
+                        &op.timeline_id,
+                        op.generation,
+                        &mut layer_paths,
+                    ) {
+                        self.flush().await;
+                        let retry_succeeded = self.pending.push(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            op.generation,
+                            &mut layer_paths,
+                        );
+                        if !retry_succeeded {
+                            // Unexpected: after we flush, we should have
+                            // drained self.pending, so a conflict on
+                            // generation numbers should be impossible.
+                            tracing::error!(
+                                "Failed to enqueue deletions, leaking objects.  This is a bug."
+                            );
+                            metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        }
+                    }
+                }
+                ListWriterQueueMessage::Flush(op) => {
+                    if self.pending.is_empty() {
+                        // Execute immediately
+                        debug!("Flush: No pending objects, flushing immediately");
+                        op.notify()
+                    } else {
+                        // Execute next time we flush
+                        debug!("Flush: adding to pending flush list for next deadline flush");
+                        self.pending_flushes.push(op);
+                    }
+                }
+                ListWriterQueueMessage::FlushExecute(op) => {
+                    debug!("FlushExecute: passing through to backend");
+                    // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
+                    if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
+                        info!("Can't flush, shutting down ({e})");
+                        // Caller will get error when their oneshot sender was dropped.
+                    }
+                }
+                ListWriterQueueMessage::Recover(op) => {
+                    if self.recovered {
+                        tracing::error!(
+                            "Deletion queue recovery called more than once.  This is a bug."
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
+                        continue;
+                    }
+
+                    if let Err(e) = self.recover(op.attached_tenants).await {
+                        // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
+                        // queue receiver has been dropped, or something is critically broken with
+                        // the local filesystem holding deletion lists.
+                        info!(
+                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        return;
+                    } else {
+                        self.recovered = true;
+                    }
+                }
+            }
+
+            if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
+                self.flush().await;
+            }
+        }
+        info!("Deletion queue shut down.");
+    }
+}
diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
new file mode 100644
index 0000000000..64603045d2
--- /dev/null
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -0,0 +1,414 @@
+//! The validator is responsible for validating DeletionLists for execution,
+//! based on whethe the generation in the DeletionList is still the latest
+//! generation for a tenant.
+//!
+//! The purpose of validation is to ensure split-brain safety in the cluster
+//! of pageservers: a deletion may only be executed if the tenant generation
+//! that originated it is still current.  See docs/rfcs/025-generation-numbers.md
+//! The purpose of accumulating lists before validating them is to reduce load
+//! on the control plane API by issuing fewer, larger requests.
+//!
+//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
+//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
+//! to decide when old
+//!
+//! Deletions are passed onward to the Deleter.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+
+use crate::config::PageServerConf;
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::control_plane_client::RetryForeverError;
+use crate::metrics;
+
+use super::deleter::DeleterMessage;
+use super::DeletionHeader;
+use super::DeletionList;
+use super::DeletionQueueError;
+use super::FlushOp;
+use super::VisibleLsnUpdates;
+
+// After this length of time, do any validation work that is pending,
+// even if we haven't accumulated many keys to delete.
+//
+// This also causes updates to remote_consistent_lsn to be validated, even
+// if there were no deletions enqueued.
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+// If we have received this number of keys, proceed with attempting to execute
+const AUTOFLUSH_KEY_COUNT: usize = 16384;
+
+#[derive(Debug)]
+pub(super) enum ValidatorQueueMessage {
+    Delete(DeletionList),
+    Flush(FlushOp),
+}
+pub(super) struct Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    conf: &'static PageServerConf,
+    rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+    tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+
+    // Client for calling into control plane API for validation of deletes
+    control_plane_client: Option<C>,
+
+    // DeletionLists which are waiting generation validation.  Not safe to
+    // execute until [`validate`] has processed them.
+    pending_lists: Vec<DeletionList>,
+
+    // DeletionLists which have passed validation and are ready to execute.
+    validated_lists: Vec<DeletionList>,
+
+    // Sum of all the lengths of lists in pending_lists
+    pending_key_count: usize,
+
+    // Lsn validation state: we read projected LSNs and write back visible LSNs
+    // after validation.  This is the LSN equivalent of `pending_validation_lists`:
+    // it is drained in [`validate`]
+    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+
+    // If we failed to rewrite a deletion list due to local filesystem I/O failure,
+    // we must remember that and refuse to advance our persistent validated sequence
+    // number past the failure.
+    list_write_failed: Option<u64>,
+
+    cancel: CancellationToken,
+}
+
+impl<C> Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+        control_plane_client: Option<C>,
+        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            conf,
+            rx,
+            tx,
+            control_plane_client,
+            lsn_table,
+            pending_lists: Vec::new(),
+            validated_lists: Vec::new(),
+            pending_key_count: 0,
+            list_write_failed: None,
+            cancel,
+        }
+    }
+    /// Process any outstanding validations of generations of pending LSN updates or pending
+    /// DeletionLists.
+    ///
+    /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
+    /// go into the queue of ready-to-execute lists.
+    async fn validate(&mut self) -> Result<(), DeletionQueueError> {
+        let mut tenant_generations = HashMap::new();
+        for list in &self.pending_lists {
+            for (tenant_id, tenant_list) in &list.tenants {
+                // Note: DeletionLists are in logical time order, so generation always
+                // goes up.  By doing a simple insert() we will always end up with
+                // the latest generation seen for a tenant.
+                tenant_generations.insert(*tenant_id, tenant_list.generation);
+            }
+        }
+
+        let pending_lsn_updates = {
+            let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
+            std::mem::take(&mut *lsn_table)
+        };
+        for (tenant_id, update) in &pending_lsn_updates.tenants {
+            let entry = tenant_generations
+                .entry(*tenant_id)
+                .or_insert(update.generation);
+            if update.generation > *entry {
+                *entry = update.generation;
+            }
+        }
+
+        if tenant_generations.is_empty() {
+            // No work to do
+            return Ok(());
+        }
+
+        let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
+            match control_plane_client
+                .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
+                .await
+            {
+                Ok(tenants) => tenants,
+                Err(RetryForeverError::ShuttingDown) => {
+                    // The only way a validation call returns an error is when the cancellation token fires
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+            }
+        } else {
+            // Control plane API disabled.  In legacy mode we consider everything valid.
+            tenant_generations.keys().map(|k| (*k, true)).collect()
+        };
+
+        let mut validated_sequence: Option<u64> = None;
+
+        // Apply the validation results to the pending LSN updates
+        for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
+            let validated_generation = tenant_generations
+                .get(&tenant_id)
+                .expect("Map was built from the same keys we're reading");
+
+            let valid = tenants_valid
+                .get(&tenant_id)
+                .copied()
+                // If the tenant was missing from the validation response, it has been deleted.
+                // The Timeline that requested the LSN update is probably already torn down,
+                // or will be torn down soon.  In this case, drop the update by setting valid=false.
+                .unwrap_or(false);
+
+            if valid && *validated_generation == tenant_lsn_state.generation {
+                for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                    pending_lsn.result_slot.store(pending_lsn.projected);
+                }
+            } else {
+                // If we failed validation, then do not apply any of the projected updates
+                warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
+            }
+        }
+
+        // Apply the validation results to the pending deletion lists
+        for list in &mut self.pending_lists {
+            // Filter the list based on whether the server responded valid: true.
+            // If a tenant is omitted in the response, it has been deleted, and we should
+            // proceed with deletion.
+            let mut mutated = false;
+            list.tenants.retain(|tenant_id, tenant| {
+                let validated_generation = tenant_generations
+                    .get(tenant_id)
+                    .expect("Map was built from the same keys we're reading");
+
+                // If the tenant was missing from the validation response, it has been deleted.
+                // This means that a deletion is valid, but also redundant since the tenant's
+                // objects should have already been deleted.  Treat it as invalid to drop the
+                // redundant deletion.
+                let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
+
+                // A list is valid if it comes from the current _or previous_ generation.
+                // - The previous generation case is permitted due to how we store deletion lists locally:
+                // if we see the immediately previous generation in a locally stored deletion list,
+                // it proves that this node's disk was used for both current & previous generations,
+                // and therefore no other node was involved in between: the two generations may be
+                // logically treated as the same.
+                // - In that previous generation case, we rewrote it to the current generation
+                // in recover(), so the comparison here is simply an equality.
+
+                let this_list_valid = valid
+                    && (tenant.generation == *validated_generation);
+
+                if !this_list_valid {
+                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
+                    mutated = true;
+                }
+                this_list_valid
+            });
+            list.validated = true;
+
+            if mutated {
+                // Save the deletion list if we had to make changes due to stale generations.  The
+                // saved list is valid for execution.
+                if let Err(e) = list.save(self.conf).await {
+                    // Highly unexpected.  Could happen if e.g. disk full.
+                    // If we didn't save the trimmed list, it is _not_ valid to execute.
+                    warn!("Failed to save modified deletion list {list}: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+
+                    // Rather than have a complex retry process, just drop it and leak the objects,
+                    // scrubber will clean up eventually.
+                    list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
+
+                    // We must remember this failure, to prevent later writing out a header that
+                    // would imply the unwritable list was valid on disk.
+                    if self.list_write_failed.is_none() {
+                        self.list_write_failed = Some(list.sequence);
+                    }
+                }
+            }
+
+            validated_sequence = Some(list.sequence);
+        }
+
+        if let Some(validated_sequence) = validated_sequence {
+            if let Some(list_write_failed) = self.list_write_failed {
+                // Rare error case: we failed to write out a deletion list to excise invalid
+                // entries, so we cannot advance the header's valid sequence number past that point.
+                //
+                // In this state we will continue to validate, execute and delete deletion lists,
+                // we just cannot update the header.  It should be noticed and fixed by a human due to
+                // the nonzero value of our unexpected_errors metric.
+                warn!(
+                    sequence_number = list_write_failed,
+                    "Cannot write header because writing a deletion list failed earlier",
+                );
+            } else {
+                // Write the queue header to record how far validation progressed.  This avoids having
+                // to rewrite each DeletionList to set validated=true in it.
+                let header = DeletionHeader::new(validated_sequence);
+
+                // Drop result because the validated_sequence is an optimization.  If we fail to save it,
+                // then restart, we will drop some deletion lists, creating work for scrubber.
+                // The save() function logs a warning on error.
+                if let Err(e) = header.save(self.conf).await {
+                    warn!("Failed to write deletion queue header: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                }
+            }
+        }
+
+        // Transfer the validated lists to the validated queue, for eventual execution
+        self.validated_lists.append(&mut self.pending_lists);
+
+        Ok(())
+    }
+
+    async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
+        for list_path in list_paths {
+            debug!("Removing deletion list {}", list_path.display());
+
+            if let Err(e) = tokio::fs::remove_file(&list_path).await {
+                // Unexpected: we should have permissions and nothing else should
+                // be touching these files.  We will leave the file behind.  Subsequent
+                // pageservers will try and load it again: hopefully whatever storage
+                // issue (probably permissions) has been fixed by then.
+                tracing::error!("Failed to delete {}: {e:#}", list_path.display());
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                break;
+            }
+        }
+    }
+
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
+
+        // Issue any required generation validation calls to the control plane
+        self.validate().await?;
+
+        // After successful validation, nothing is pending: any lists that
+        // made it through validation will be in validated_lists.
+        assert!(self.pending_lists.is_empty());
+        self.pending_key_count = 0;
+
+        tracing::debug!(
+            "Validation complete, have {} validated lists",
+            self.validated_lists.len()
+        );
+
+        // Return quickly if we have no validated lists to execute.  This avoids flushing the
+        // executor when an idle backend hits its autoflush interval
+        if self.validated_lists.is_empty() {
+            return Ok(());
+        }
+
+        // Drain `validated_lists` into the executor
+        let mut executing_lists = Vec::new();
+        for list in self.validated_lists.drain(..) {
+            let list_path = self.conf.deletion_list_path(list.sequence);
+            let objects = list.into_remote_paths();
+            self.tx
+                .send(DeleterMessage::Delete(objects))
+                .await
+                .map_err(|_| DeletionQueueError::ShuttingDown)?;
+            executing_lists.push(list_path);
+        }
+
+        self.flush_executor().await?;
+
+        // Erase the deletion lists whose keys have all be deleted from remote storage
+        self.cleanup_lists(executing_lists).await;
+
+        Ok(())
+    }
+
+    async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
+        // Flush the executor, so that all the keys referenced by these deletion lists
+        // are actually removed from remote storage.  This is a precondition to deleting
+        // the deletion lists themselves.
+        let (flush_op, rx) = FlushOp::new();
+        self.tx
+            .send(DeleterMessage::Flush(flush_op))
+            .await
+            .map_err(|_| DeletionQueueError::ShuttingDown)?;
+
+        rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+
+    pub(super) async fn background(&mut self) {
+        tracing::info!("Started deletion backend worker");
+
+        while !self.cancel.is_cancelled() {
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    break;
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
+                    // return immediately if no work is pending.
+                    match self.flush().await {
+                        Ok(()) => {}
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we are shutting down, then auto-flush can safely be skipped
+                        }
+                    }
+
+                    continue;
+                }
+            };
+
+            match msg {
+                ValidatorQueueMessage::Delete(list) => {
+                    if list.validated {
+                        // A pre-validated list may only be seen during recovery, if we are recovering
+                        // a DeletionList whose on-disk state has validated=true
+                        self.validated_lists.push(list)
+                    } else {
+                        self.pending_key_count += list.len();
+                        self.pending_lists.push(list);
+                    }
+
+                    if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
+                        match self.flush().await {
+                            Ok(()) => {}
+                            Err(DeletionQueueError::ShuttingDown) => {
+                                // If we are shutting down, then auto-flush can safely be skipped
+                            }
+                        }
+                    }
+                }
+                ValidatorQueueMessage::Flush(op) => {
+                    match self.flush().await {
+                        Ok(()) => {
+                            op.notify();
+                        }
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we fail due to shutting down, we will just drop `op` to propagate that status.
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 4988641d6a..f5c1224f01 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1093,6 +1093,9 @@ components:
         remote_consistent_lsn:
           type: string
           format: hex
+        remote_consistent_lsn_visible:
+          type: string
+          format: hex
         ancestor_timeline_id:
           type: string
           format: hex
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a8e914ba08..e61a9dcf3f 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -5,6 +5,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 
 use anyhow::{anyhow, Context, Result};
+use futures::TryFutureExt;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -24,6 +25,7 @@ use super::models::{
     TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -34,7 +36,7 @@ use crate::tenant::mgr::{
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
@@ -61,6 +63,7 @@ pub struct State {
     remote_storage: Option<GenericRemoteStorage>,
     broker_client: storage_broker::BrokerClientChannel,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+    deletion_queue_client: DeletionQueueClient,
 }
 
 impl State {
@@ -70,6 +73,7 @@ impl State {
         remote_storage: Option<GenericRemoteStorage>,
         broker_client: storage_broker::BrokerClientChannel,
         disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+        deletion_queue_client: DeletionQueueClient,
     ) -> anyhow::Result<Self> {
         let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
             .iter()
@@ -82,8 +86,17 @@ impl State {
             remote_storage,
             broker_client,
             disk_usage_eviction_state,
+            deletion_queue_client,
         })
     }
+
+    fn tenant_resources(&self) -> TenantSharedResources {
+        TenantSharedResources {
+            broker_client: self.broker_client.clone(),
+            remote_storage: self.remote_storage.clone(),
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
+    }
 }
 
 #[inline(always)]
@@ -283,7 +296,12 @@ async fn build_timeline_info_common(
     };
     let current_physical_size = Some(timeline.layer_size_sum().await);
     let state = timeline.current_state();
-    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+    let remote_consistent_lsn_projected = timeline
+        .get_remote_consistent_lsn_projected()
+        .unwrap_or(Lsn(0));
+    let remote_consistent_lsn_visible = timeline
+        .get_remote_consistent_lsn_visible()
+        .unwrap_or(Lsn(0));
 
     let walreceiver_status = timeline.walreceiver_status();
 
@@ -293,7 +311,8 @@ async fn build_timeline_info_common(
         ancestor_timeline_id,
         ancestor_lsn,
         disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
-        remote_consistent_lsn,
+        remote_consistent_lsn: remote_consistent_lsn_projected,
+        remote_consistent_lsn_visible,
         last_record_lsn,
         prev_record_lsn: Some(timeline.get_prev_record_lsn()),
         latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -492,24 +511,23 @@ async fn tenant_attach_handler(
 
     let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
 
-    if let Some(remote_storage) = &state.remote_storage {
-        mgr::attach_tenant(
-            state.conf,
-            tenant_id,
-            generation,
-            tenant_conf,
-            state.broker_client.clone(),
-            remote_storage.clone(),
-            &ctx,
-        )
-        .instrument(info_span!("tenant_attach", %tenant_id))
-        .await?;
-    } else {
+    if state.remote_storage.is_none() {
         return Err(ApiError::BadRequest(anyhow!(
             "attach_tenant is not possible because pageserver was configured without remote storage"
         )));
     }
 
+    mgr::attach_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        tenant_conf,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_attach", %tenant_id))
+    .await?;
+
     json_response(StatusCode::ACCEPTED, ())
 }
 
@@ -570,6 +588,7 @@ async fn tenant_load_handler(
         generation,
         state.broker_client.clone(),
         state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
         &ctx,
     )
     .instrument(info_span!("load", %tenant_id))
@@ -911,8 +930,7 @@ async fn tenant_create_handler(
         tenant_conf,
         target_tenant_id,
         generation,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
+        state.tenant_resources(),
         &ctx,
     )
     .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get(
     json_response(StatusCode::OK, info)
 }
 
+async fn deletion_queue_flush(
+    r: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&r);
+
+    if state.remote_storage.is_none() {
+        // Nothing to do if remote storage is disabled.
+        return json_response(StatusCode::OK, ());
+    }
+
+    let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
+
+    let flush = async {
+        if execute {
+            state.deletion_queue_client.flush_execute().await
+        } else {
+            state.deletion_queue_client.flush().await
+        }
+    }
+    // DeletionQueueError's only case is shutting down.
+    .map_err(|_| ApiError::ShuttingDown);
+
+    tokio::select! {
+        res = flush => {
+            res.map(|()| json_response(StatusCode::OK, ()))?
+        }
+        _ = cancel.cancelled() => {
+            Err(ApiError::ShuttingDown)
+        }
+    }
+}
+
 async fn active_timeline_of_active_tenant(
     tenant_id: TenantId,
     timeline_id: TimelineId,
@@ -1463,6 +1514,9 @@ pub fn make_router(
         .put("/v1/disk_usage_eviction/run", |r| {
             api_handler(r, disk_usage_eviction_run)
         })
+        .put("/v1/deletion_queue/flush", |r| {
+            api_handler(r, deletion_queue_flush)
+        })
         .put("/v1/tenant/:tenant_id/break", |r| {
             testing_api_handler("set tenant state to broken", r, handle_tenant_break)
         })
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 3049ad6a4e..e370e063ba 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,7 +3,8 @@ pub mod basebackup;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
-mod control_plane_client;
+pub mod control_plane_client;
+pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
@@ -27,6 +28,7 @@ pub mod failpoint_support;
 use std::path::Path;
 
 use crate::task_mgr::TaskKind;
+use deletion_queue::DeletionQueue;
 use tracing::info;
 
 /// Current storage format version
@@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
 pub use crate::metrics::preinitialize_metrics;
 
-#[tracing::instrument]
-pub async fn shutdown_pageserver(exit_code: i32) {
+#[tracing::instrument(skip_all, fields(%exit_code))]
+pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
     use std::time::Duration;
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
@@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) {
     )
     .await;
 
+    // Best effort to persist any outstanding deletions, to avoid leaking objects
+    if let Some(mut deletion_queue) = deletion_queue {
+        deletion_queue.shutdown(Duration::from_secs(5)).await;
+    }
+
     // Shut down the HTTP endpoint last, so that you can still check the server's
     // status while it's shutting down.
     // FIXME: We should probably stop accepting commands like attach/detach earlier.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 98dee095a3..f85f525630 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -264,6 +264,46 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
     },
 });
 
+pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_cache_acquire_pinned_slot_seconds",
+        "Time spent acquiring a pinned slot in the page cache",
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_page_cache_find_victim_iters_total",
+        "Counter for the number of iterations in the find_victim loop",
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "page_cache_errors_total",
+        "Number of timeouts while acquiring a pinned slot in the page cache",
+        &["error_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum PageCacheErrorKind {
+    AcquirePinnedSlotTimeout,
+    EvictIterLimit,
+}
+
+pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
+    PAGE_CACHE_ERRORS
+        .get_metric_with_label_values(&[error_kind.into()])
+        .unwrap()
+        .inc();
+}
+
 pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_wait_lsn_seconds",
@@ -291,6 +331,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_resident_physical_size_global",
+        "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_remote_physical_size",
@@ -301,6 +349,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_remote_physical_size_global",
+        "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "pageserver_remote_ondemand_downloaded_layers_total",
@@ -887,6 +943,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
     .expect("failed to define a metric")
 });
 
+pub(crate) struct DeletionQueueMetrics {
+    pub(crate) keys_submitted: IntCounter,
+    pub(crate) keys_dropped: IntCounter,
+    pub(crate) keys_executed: IntCounter,
+    pub(crate) dropped_lsn_updates: IntCounter,
+    pub(crate) unexpected_errors: IntCounter,
+    pub(crate) remote_errors: IntCounterVec,
+}
+pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
+    DeletionQueueMetrics{
+
+    keys_submitted: register_int_counter!(
+        "pageserver_deletion_queue_submitted_total",
+        "Number of objects submitted for deletion"
+    )
+    .expect("failed to define a metric"),
+
+    keys_dropped: register_int_counter!(
+        "pageserver_deletion_queue_dropped_total",
+        "Number of object deletions dropped due to stale generation."
+    )
+    .expect("failed to define a metric"),
+
+    keys_executed: register_int_counter!(
+        "pageserver_deletion_queue_executed_total",
+        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
+    )
+    .expect("failed to define a metric"),
+
+    dropped_lsn_updates: register_int_counter!(
+        "pageserver_deletion_queue_dropped_lsn_updates_total",
+        "Updates to remote_consistent_lsn dropped due to stale generation number."
+    )
+    .expect("failed to define a metric"),
+    unexpected_errors: register_int_counter!(
+        "pageserver_deletion_queue_unexpected_errors_total",
+        "Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
+    )
+    .expect("failed to define a metric"),
+    remote_errors: register_int_counter_vec!(
+        "pageserver_deletion_queue_remote_errors_total",
+        "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
+        &["op_kind"],
+    )
+    .expect("failed to define a metric")
+}
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -1161,7 +1265,7 @@ pub struct TimelineMetrics {
     pub load_layer_map_histo: StorageTimeMetrics,
     pub garbage_collect_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
-    pub resident_physical_size_gauge: UIntGauge,
+    resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub num_persistent_files_created: IntCounter,
@@ -1239,10 +1343,29 @@ impl TimelineMetrics {
     }
 
     pub fn record_new_file_metrics(&self, sz: u64) {
-        self.resident_physical_size_gauge.add(sz);
+        self.resident_physical_size_add(sz);
         self.num_persistent_files_created.inc_by(1);
         self.persistent_bytes_written.inc_by(sz);
     }
+
+    pub fn resident_physical_size_sub(&self, sz: u64) {
+        self.resident_physical_size_gauge.sub(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
+    }
+
+    pub fn resident_physical_size_add(&self, sz: u64) {
+        self.resident_physical_size_gauge.add(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
+    }
+
+    pub fn resident_physical_size_set(&self, sz: u64) {
+        self.resident_physical_size_gauge.set(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
+    }
+
+    pub fn resident_physical_size_get(&self) -> u64 {
+        self.resident_physical_size_gauge.get()
+    }
 }
 
 impl Drop for TimelineMetrics {
@@ -1250,7 +1373,10 @@ impl Drop for TimelineMetrics {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        {
+            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        }
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
         let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1304,10 +1430,43 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 
+/// Maintain a per timeline gauge in addition to the global gauge.
+struct PerTimelineRemotePhysicalSizeGauge {
+    last_set: u64,
+    gauge: UIntGauge,
+}
+
+impl PerTimelineRemotePhysicalSizeGauge {
+    fn new(per_timeline_gauge: UIntGauge) -> Self {
+        Self {
+            last_set: per_timeline_gauge.get(),
+            gauge: per_timeline_gauge,
+        }
+    }
+    fn set(&mut self, sz: u64) {
+        self.gauge.set(sz);
+        if sz < self.last_set {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
+        } else {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
+        };
+        self.last_set = sz;
+    }
+    fn get(&self) -> u64 {
+        self.gauge.get()
+    }
+}
+
+impl Drop for PerTimelineRemotePhysicalSizeGauge {
+    fn drop(&mut self) {
+        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
+    }
+}
+
 pub struct RemoteTimelineClientMetrics {
     tenant_id: String,
     timeline_id: String,
-    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
     calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
     bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
     bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -1325,18 +1484,24 @@ impl RemoteTimelineClientMetrics {
         }
     }
 
-    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
+    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
         let mut guard = self.remote_physical_size_gauge.lock().unwrap();
-        guard
-            .get_or_insert_with(|| {
+        let gauge = guard.get_or_insert_with(|| {
+            PerTimelineRemotePhysicalSizeGauge::new(
                 REMOTE_PHYSICAL_SIZE
                     .get_metric_with_label_values(&[
                         &self.tenant_id.to_string(),
                         &self.timeline_id.to_string(),
                     ])
-                    .unwrap()
-            })
-            .clone()
+                    .unwrap(),
+            )
+        });
+        gauge.set(sz);
+    }
+
+    pub(crate) fn remote_physical_size_get(&self) -> u64 {
+        let guard = self.remote_physical_size_gauge.lock().unwrap();
+        guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
     }
 
     pub fn remote_operation_time(
@@ -1675,6 +1840,9 @@ pub fn preinitialize_metrics() {
         Lazy::force(c);
     });
 
+    // Deletion queue stats
+    Lazy::force(&DELETION_QUEUE);
+
     // countervecs
     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
         .into_iter()
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index 38b169ea85..97ca2bfea7 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,7 +75,11 @@
 use std::{
     collections::{hash_map::Entry, HashMap},
     convert::TryInto,
-    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+    sync::{
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+        Arc, Weak,
+    },
+    time::Duration,
 };
 
 use anyhow::Context;
@@ -165,6 +169,8 @@ struct Slot {
 
 struct SlotInner {
     key: Option<CacheKey>,
+    // for `coalesce_readers_permit`
+    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
     buf: &'static mut [u8; PAGE_SZ],
 }
 
@@ -207,6 +213,22 @@ impl Slot {
     }
 }
 
+impl SlotInner {
+    /// If there is aready a reader, drop our permit and share its permit, just like we share read access.
+    fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
+        let mut guard = self.permit.lock().unwrap();
+        if let Some(existing_permit) = guard.upgrade() {
+            drop(guard);
+            drop(permit);
+            existing_permit
+        } else {
+            let permit = Arc::new(permit);
+            *guard = Arc::downgrade(&permit);
+            permit
+        }
+    }
+}
+
 pub struct PageCache {
     /// This contains the mapping from the cache key to buffer slot that currently
     /// contains the page, if any.
@@ -224,6 +246,8 @@ pub struct PageCache {
     /// The actual buffers with their metadata.
     slots: Box<[Slot]>,
 
+    pinned_slots: Arc<tokio::sync::Semaphore>,
+
     /// Index of the next candidate to evict, for the Clock replacement algorithm.
     /// This is interpreted modulo the page cache size.
     next_evict_slot: AtomicUsize,
@@ -231,23 +255,28 @@ pub struct PageCache {
     size_metrics: &'static PageCacheSizeMetrics,
 }
 
+struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i> {
+    _permit: Arc<PinnedSlotsPermit>,
+    slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
+}
 
 impl std::ops::Deref for PageReadGuard<'_> {
     type Target = [u8; PAGE_SZ];
 
     fn deref(&self) -> &Self::Target {
-        self.0.buf
+        self.slot_guard.buf
     }
 }
 
 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
     fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.0.buf
+        self.slot_guard.buf
     }
 }
 
@@ -264,6 +293,8 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 pub struct PageWriteGuard<'i> {
     inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
 
+    _permit: PinnedSlotsPermit,
+
     // Are the page contents currently valid?
     // Used to mark pages as invalid that are assigned but not yet filled with data.
     valid: bool,
@@ -348,6 +379,10 @@ impl PageCache {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Option<(Lsn, PageReadGuard)> {
+        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+            return None;
+        };
+
         crate::metrics::PAGE_CACHE
             .for_ctx(ctx)
             .read_accesses_materialized_page
@@ -362,7 +397,10 @@ impl PageCache {
             lsn,
         };
 
-        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
+        if let Some(guard) = self
+            .try_lock_for_read(&mut cache_key, &mut Some(permit))
+            .await
+        {
             if let CacheKey::MaterializedPage {
                 hash_key: _,
                 lsn: available_lsn,
@@ -445,6 +483,29 @@ impl PageCache {
     // "mappings" after this section. But the routines in this section should
     // not require changes.
 
+    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
+        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
+        match tokio::time::timeout(
+            // Choose small timeout, neon_smgr does its own retries.
+            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
+            Duration::from_secs(10),
+            Arc::clone(&self.pinned_slots).acquire_owned(),
+        )
+        .await
+        {
+            Ok(res) => Ok(PinnedSlotsPermit(
+                res.expect("this semaphore is never closed"),
+            )),
+            Err(_timeout) => {
+                timer.stop_and_discard();
+                crate::metrics::page_cache_errors_inc(
+                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
+                );
+                anyhow::bail!("timeout: there were page guards alive for all page cache slots")
+            }
+        }
+    }
+
     /// Look up a page in the cache.
     ///
     /// If the search criteria is not exact, *cache_key is updated with the key
@@ -454,7 +515,11 @@ impl PageCache {
     ///
     /// If no page is found, returns None and *cache_key is left unmodified.
     ///
-    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+    async fn try_lock_for_read(
+        &self,
+        cache_key: &mut CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageReadGuard> {
         let cache_key_orig = cache_key.clone();
         if let Some(slot_idx) = self.search_mapping(cache_key) {
             // The page was found in the mapping. Lock the slot, and re-check
@@ -464,7 +529,10 @@ impl PageCache {
             let inner = slot.inner.read().await;
             if inner.key.as_ref() == Some(cache_key) {
                 slot.inc_usage_count();
-                return Some(PageReadGuard(inner));
+                return Some(PageReadGuard {
+                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
+                    slot_guard: inner,
+                });
             } else {
                 // search_mapping might have modified the search key; restore it.
                 *cache_key = cache_key_orig;
@@ -507,6 +575,8 @@ impl PageCache {
         cache_key: &mut CacheKey,
         ctx: &RequestContext,
     ) -> anyhow::Result<ReadBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+
         let (read_access, hit) = match cache_key {
             CacheKey::MaterializedPage { .. } => {
                 unreachable!("Materialized pages use lookup_materialized_page")
@@ -523,17 +593,21 @@ impl PageCache {
         let mut is_first_iteration = true;
         loop {
             // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
+            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
                 if is_first_iteration {
                     hit.inc();
                 }
                 return Ok(ReadBufResult::Found(read_guard));
             }
+            debug_assert!(permit.is_some());
             is_first_iteration = false;
 
             // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
 
             // Insert mapping for this. At this point, we may find that another
             // thread did the same thing concurrently. In that case, we evicted
@@ -555,7 +629,16 @@ impl PageCache {
             inner.key = Some(cache_key.clone());
             slot.set_usage_count(1);
 
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
             return Ok(ReadBufResult::NotFound(PageWriteGuard {
+                _permit: permit.take().unwrap(),
                 inner,
                 valid: false,
             }));
@@ -566,7 +649,11 @@ impl PageCache {
     /// found, returns None.
     ///
     /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
+    async fn try_lock_for_write(
+        &self,
+        cache_key: &CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageWriteGuard> {
         if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
             // The page was found in the mapping. Lock the slot, and re-check
             // that it's still what we expected (because we don't released the mapping
@@ -575,7 +662,18 @@ impl PageCache {
             let inner = slot.inner.write().await;
             if inner.key.as_ref() == Some(cache_key) {
                 slot.inc_usage_count();
-                return Some(PageWriteGuard { inner, valid: true });
+                debug_assert!(
+                    {
+                        let guard = inner.permit.lock().unwrap();
+                        guard.upgrade().is_none()
+                    },
+                    "we hold a write lock, so, no one else should have a permit"
+                );
+                return Some(PageWriteGuard {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                    valid: true,
+                });
             }
         }
         None
@@ -586,15 +684,20 @@ impl PageCache {
     /// Similar to lock_for_read(), but the returned buffer is write-locked and
     /// may be modified by the caller even if it's already found in the cache.
     async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
         loop {
             // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
+            if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
                 return Ok(WriteBufResult::Found(write_guard));
             }
+            debug_assert!(permit.is_some());
 
             // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
 
             // Insert mapping for this. At this point, we may find that another
             // thread did the same thing concurrently. In that case, we evicted
@@ -616,7 +719,16 @@ impl PageCache {
             inner.key = Some(cache_key.clone());
             slot.set_usage_count(1);
 
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
             return Ok(WriteBufResult::NotFound(PageWriteGuard {
+                _permit: permit.take().unwrap(),
                 inner,
                 valid: false,
             }));
@@ -769,7 +881,10 @@ impl PageCache {
     /// Find a slot to evict.
     ///
     /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+    async fn find_victim(
+        &self,
+        _permit_witness: &PinnedSlotsPermit,
+    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
         let iter_limit = self.slots.len() * 10;
         let mut iters = 0;
         loop {
@@ -782,13 +897,40 @@ impl PageCache {
                 let mut inner = match slot.inner.try_write() {
                     Ok(inner) => inner,
                     Err(_err) => {
-                        // If we have looped through the whole buffer pool 10 times
-                        // and still haven't found a victim buffer, something's wrong.
-                        // Maybe all the buffers were in locked. That could happen in
-                        // theory, if you have more threads holding buffers locked than
-                        // there are buffers in the pool. In practice, with a reasonably
-                        // large buffer pool it really shouldn't happen.
                         if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
                             anyhow::bail!("exceeded evict iter limit");
                         }
                         continue;
@@ -799,6 +941,7 @@ impl PageCache {
                     self.remove_mapping(old_key);
                     inner.key = None;
                 }
+                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                 return Ok((slot_idx, inner));
             }
         }
@@ -826,7 +969,11 @@ impl PageCache {
                 let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
 
                 Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
                     usage_count: AtomicU8::new(0),
                 }
             })
@@ -838,6 +985,7 @@ impl PageCache {
             slots,
             next_evict_slot: AtomicUsize::new(0),
             size_metrics,
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
         }
     }
 }
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 047fa761c3..7a94c3449d 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,7 @@ impl Key {
             | self.field6 as i128
     }
 
-    pub fn from_i128(x: i128) -> Self {
+    pub const fn from_i128(x: i128) -> Self {
         Key {
             field1: ((x >> 120) & 0xf) as u8,
             field2: ((x >> 104) & 0xFFFF) as u32,
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 650bc119b6..017322ffb2 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -456,7 +456,7 @@ async fn task_finish(
     }
 
     if shutdown_process {
-        shutdown_pageserver(1).await;
+        shutdown_pageserver(None, 1).await;
     }
 }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1c92c618fa..47bfd4a8ef 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT_ACTIVATION;
@@ -117,7 +118,7 @@ mod span;
 
 pub mod metadata;
 mod par_fsync;
-mod remote_timeline_client;
+pub mod remote_timeline_client;
 pub mod storage_layer;
 
 pub mod config;
@@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
 pub struct TenantSharedResources {
     pub broker_client: storage_broker::BrokerClientChannel,
     pub remote_storage: Option<GenericRemoteStorage>,
+    pub deletion_queue_client: DeletionQueueClient,
 }
 
 ///
@@ -197,6 +199,9 @@ pub struct Tenant {
     // provides access to timeline data sitting in the remote storage
     pub(crate) remote_storage: Option<GenericRemoteStorage>,
 
+    // Access to global deletion queue for when this tenant wants to schedule a deletion
+    deletion_queue_client: DeletionQueueClient,
+
     /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
     cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
     cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -523,15 +528,20 @@ impl Tenant {
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         generation: Generation,
-        broker_client: storage_broker::BrokerClientChannel,
+        resources: TenantSharedResources,
         tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        remote_storage: GenericRemoteStorage,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
         // TODO dedup with spawn_load
         let tenant_conf =
             Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
 
+        let TenantSharedResources {
+            broker_client,
+            remote_storage,
+            deletion_queue_client,
+        } = resources;
+
         let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
         let tenant = Arc::new(Tenant::new(
             TenantState::Attaching,
@@ -540,7 +550,8 @@ impl Tenant {
             wal_redo_manager,
             tenant_id,
             generation,
-            Some(remote_storage.clone()),
+            remote_storage.clone(),
+            deletion_queue_client,
         ));
 
         // Do all the hard work in the background
@@ -571,7 +582,7 @@ impl Tenant {
                 let pending_deletion = {
                     match DeleteTenantFlow::should_resume_deletion(
                         conf,
-                        Some(&remote_storage),
+                        remote_storage.as_ref(),
                         &tenant_clone,
                     )
                     .await
@@ -660,6 +671,7 @@ impl Tenant {
         for timeline_id in remote_timeline_ids {
             let client = RemoteTimelineClient::new(
                 remote_storage.clone(),
+                self.deletion_queue_client.clone(),
                 self.conf,
                 self.tenant_id,
                 timeline_id,
@@ -726,6 +738,7 @@ impl Tenant {
                 remote_metadata,
                 TimelineResources {
                     remote_client: Some(remote_client),
+                    deletion_queue_client: self.deletion_queue_client.clone(),
                 },
                 ctx,
             )
@@ -750,6 +763,7 @@ impl Tenant {
                 timeline_id,
                 &index_part.metadata,
                 Some(remote_timeline_client),
+                self.deletion_queue_client.clone(),
                 None,
             )
             .await
@@ -851,6 +865,7 @@ impl Tenant {
             tenant_id,
             Generation::broken(),
             None,
+            DeletionQueueClient::broken(),
         ))
     }
 
@@ -895,6 +910,7 @@ impl Tenant {
             tenant_id,
             generation,
             remote_storage.clone(),
+            resources.deletion_queue_client.clone(),
         );
         let tenant = Arc::new(tenant);
 
@@ -1302,6 +1318,7 @@ impl Tenant {
                                 timeline_id,
                                 &local_metadata,
                                 Some(remote_client),
+                                self.deletion_queue_client.clone(),
                                 init_order,
                             )
                             .await
@@ -1351,6 +1368,7 @@ impl Tenant {
                         timeline_id,
                         &local_metadata,
                         None,
+                        self.deletion_queue_client.clone(),
                         init_order,
                     )
                     .await
@@ -2242,6 +2260,9 @@ impl Tenant {
         Ok(timeline)
     }
 
+    // Allow too_many_arguments because a constructor's argument list naturally grows with the
+    // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
+    #[allow(clippy::too_many_arguments)]
     fn new(
         state: TenantState,
         conf: &'static PageServerConf,
@@ -2250,6 +2271,7 @@ impl Tenant {
         tenant_id: TenantId,
         generation: Generation,
         remote_storage: Option<GenericRemoteStorage>,
+        deletion_queue_client: DeletionQueueClient,
     ) -> Tenant {
         let (state, mut rx) = watch::channel(state);
 
@@ -2317,6 +2339,7 @@ impl Tenant {
             gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
+            deletion_queue_client,
             state,
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2856,6 +2879,7 @@ impl Tenant {
         let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
             let remote_client = RemoteTimelineClient::new(
                 remote_storage.clone(),
+                self.deletion_queue_client.clone(),
                 self.conf,
                 self.tenant_id,
                 timeline_id,
@@ -2866,7 +2890,10 @@ impl Tenant {
             None
         };
 
-        TimelineResources { remote_client }
+        TimelineResources {
+            remote_client,
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
     }
 
     /// Creates intermediate timeline structure and its files.
@@ -3322,6 +3349,7 @@ pub mod harness {
     use utils::logging;
     use utils::lsn::Lsn;
 
+    use crate::deletion_queue::mock::MockDeletionQueue;
     use crate::{
         config::PageServerConf,
         repository::Key,
@@ -3383,6 +3411,7 @@ pub mod harness {
         pub generation: Generation,
         pub remote_storage: GenericRemoteStorage,
         pub remote_fs_dir: PathBuf,
+        pub deletion_queue: MockDeletionQueue,
     }
 
     static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3431,6 +3460,7 @@ pub mod harness {
                 storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
             };
             let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
+            let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
 
             Ok(Self {
                 conf,
@@ -3439,6 +3469,7 @@ pub mod harness {
                 generation: Generation::new(0xdeadbeef),
                 remote_storage,
                 remote_fs_dir,
+                deletion_queue,
             })
         }
 
@@ -3463,6 +3494,7 @@ pub mod harness {
                 self.tenant_id,
                 self.generation,
                 Some(self.remote_storage.clone()),
+                self.deletion_queue.new_client(),
             ));
             tenant
                 .load(None, ctx)
@@ -4193,7 +4225,8 @@ mod tests {
     //
     #[tokio::test]
     async fn test_bulk_insert() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
+        let harness = TenantHarness::create("test_bulk_insert")?;
+        let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -4240,7 +4273,8 @@ mod tests {
 
     #[tokio::test]
     async fn test_random_updates() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
+        let harness = TenantHarness::create("test_random_updates")?;
+        let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 74faee1115..17bcc9eb5f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,7 +20,10 @@ use utils::crashsafe;
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::ControlPlaneClient;
+use crate::control_plane_client::{
+    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
+};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::delete::DeleteTenantFlow;
@@ -116,7 +119,28 @@ pub async fn init_tenant_mgr(
 
     // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
     let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
-        Some(client.re_attach().await?)
+        let result = match client.re_attach().await {
+            Ok(tenants) => tenants,
+            Err(RetryForeverError::ShuttingDown) => {
+                anyhow::bail!("Shut down while waiting for control plane re-attach response")
+            }
+        };
+
+        // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
+        // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
+        // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
+        // are processed, even though we don't block on recovery completing here.
+        //
+        // Must only do this if remote storage is enabled, otherwise deletion queue
+        // is not running and channel push will fail.
+        if resources.remote_storage.is_some() {
+            resources
+                .deletion_queue_client
+                .recover(result.clone())
+                .await?;
+        }
+
+        Some(result)
     } else {
         info!("Control plane API not configured, tenant generations are disabled");
         None
@@ -285,29 +309,21 @@ pub(crate) fn schedule_local_tenant_processing(
 
     let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
         info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if let Some(remote_storage) = resources.remote_storage {
-            match Tenant::spawn_attach(
-                conf,
-                tenant_id,
-                generation,
-                resources.broker_client,
-                tenants,
-                remote_storage,
-                ctx,
-            ) {
-                Ok(tenant) => tenant,
-                Err(e) => {
-                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
-                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
-                }
-            }
-        } else {
+        if resources.remote_storage.is_none() {
             warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
             Tenant::create_broken_tenant(
                 conf,
                 tenant_id,
                 "attaching mark file present but no remote storage configured".to_string(),
             )
+        } else {
+            match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
         }
     } else {
         info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -438,8 +454,7 @@ pub async fn create_tenant(
     tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
     generation: Generation,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    resources: TenantSharedResources,
     ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
     tenant_map_insert(tenant_id, || async {
@@ -450,13 +465,9 @@ pub async fn create_tenant(
         // TODO: tenant directory remains on disk if we bail out from here on.
         //       See https://github.com/neondatabase/neon/issues/4233
 
-        let tenant_resources = TenantSharedResources {
-            broker_client,
-            remote_storage,
-        };
         let created_tenant =
             schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                generation, tenant_resources, None, &TENANTS, ctx)?;
+                generation, resources, None, &TENANTS, ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
 
@@ -622,6 +633,7 @@ pub async fn load_tenant(
     generation: Generation,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
+    deletion_queue_client: DeletionQueueClient,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
     tenant_map_insert(tenant_id, || async {
@@ -635,6 +647,7 @@ pub async fn load_tenant(
         let resources = TenantSharedResources {
             broker_client,
             remote_storage,
+            deletion_queue_client
         };
         let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None,  &TENANTS, ctx)
             .with_context(|| {
@@ -702,8 +715,7 @@ pub async fn attach_tenant(
     tenant_id: TenantId,
     generation: Generation,
     tenant_conf: TenantConfOpt,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: GenericRemoteStorage,
+    resources: TenantSharedResources,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
     tenant_map_insert(tenant_id, || async {
@@ -718,10 +730,7 @@ pub async fn attach_tenant(
             .context("check for attach marker file existence")?;
         anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
 
-        let resources = TenantSharedResources {
-            broker_client,
-            remote_storage: Some(remote_storage),
-        };
+
         let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 6f42b54ac2..ee99151ef2 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -116,8 +116,12 @@
 //! # Completion
 //!
 //! Once an operation has completed, we update
-//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
-//! to safekeepers that they can delete the WAL up to that LSN.
+//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
+//! and submit a request through the DeletionQueue to update
+//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
+//! validated that our generation is not stale.  It is this visible value
+//! that is advertized to safekeepers as a signal that that they can
+//! delete the WAL up to that LSN.
 //!
 //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
 //! for all pending operations to complete. It does not prevent more
@@ -200,7 +204,6 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
 
-mod delete;
 mod download;
 pub mod index;
 mod upload;
@@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
+use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{
     MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
     RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -324,6 +328,8 @@ pub struct RemoteTimelineClient {
     metrics: Arc<RemoteTimelineClientMetrics>,
 
     storage_impl: GenericRemoteStorage,
+
+    deletion_queue_client: DeletionQueueClient,
 }
 
 impl RemoteTimelineClient {
@@ -335,6 +341,7 @@ impl RemoteTimelineClient {
     ///
     pub fn new(
         remote_storage: GenericRemoteStorage,
+        deletion_queue_client: DeletionQueueClient,
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         timeline_id: TimelineId,
@@ -352,6 +359,7 @@ impl RemoteTimelineClient {
             timeline_id,
             generation,
             storage_impl: remote_storage,
+            deletion_queue_client,
             upload_queue: Mutex::new(UploadQueue::Uninitialized),
             metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
         }
@@ -413,13 +421,24 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
-    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
-        match &*self.upload_queue.lock().unwrap() {
+    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+        match &mut *self.upload_queue.lock().unwrap() {
             UploadQueue::Uninitialized => None,
-            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => {
-                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
-            }
+            UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
+            UploadQueue::Stopped(q) => q
+                .upload_queue_for_deletion
+                .get_last_remote_consistent_lsn_projected(),
+        }
+    }
+
+    pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
+        match &mut *self.upload_queue.lock().unwrap() {
+            UploadQueue::Uninitialized => None,
+            UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
+            UploadQueue::Stopped(q) => Some(
+                q.upload_queue_for_deletion
+                    .get_last_remote_consistent_lsn_visible(),
+            ),
         }
     }
 
@@ -434,11 +453,11 @@ impl RemoteTimelineClient {
         } else {
             0
         };
-        self.metrics.remote_physical_size_gauge().set(size);
+        self.metrics.remote_physical_size_set(size);
     }
 
     pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_gauge().get()
+        self.metrics.remote_physical_size_get()
     }
 
     //
@@ -643,7 +662,7 @@ impl RemoteTimelineClient {
     /// successfully.
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
-        names: &[LayerFileName],
+        names: Vec<LayerFileName>,
     ) -> anyhow::Result<()> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
@@ -663,10 +682,10 @@ impl RemoteTimelineClient {
             // Decorate our list of names with each name's generation, dropping
             // makes that are unexpectedly missing from our metadata.
             let with_generations: Vec<_> = names
-                .iter()
+                .into_iter()
                 .filter_map(|name| {
                     // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(name);
+                    let meta = upload_queue.latest_files.remove(&name);
 
                     if let Some(meta) = meta {
                         upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -688,19 +707,17 @@ impl RemoteTimelineClient {
                 self.schedule_index_upload(upload_queue, metadata);
             }
 
-            // schedule the actual deletions
-            for (name, generation) in with_generations {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: false,
-                    generation,
-                });
-                self.calls_unfinished_metric_begin(&op);
-                upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {name}");
+            for (name, gen) in &with_generations {
+                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
             }
 
+            // schedule the actual deletions
+            let op = UploadOp::Delete(Delete {
+                layers: with_generations,
+            });
+            self.calls_unfinished_metric_begin(&op);
+            upload_queue.queued_operations.push_back(op);
+
             // Launch the tasks immediately, if possible
             self.launch_queued_tasks(upload_queue);
         };
@@ -833,9 +850,7 @@ impl RemoteTimelineClient {
     pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        let (mut receiver, deletions_queued) = {
-            let mut deletions_queued = 0;
-
+        let layers: Vec<RemotePath> = {
             let mut locked = self.upload_queue.lock().unwrap();
             let stopped = locked.stopped_mut()?;
 
@@ -847,42 +862,30 @@ impl RemoteTimelineClient {
 
             stopped
                 .upload_queue_for_deletion
-                .queued_operations
-                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
-
-            // schedule the actual deletions
-            for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: true,
-                    generation: meta.generation,
-                });
-
-                self.calls_unfinished_metric_begin(&op);
-                stopped
-                    .upload_queue_for_deletion
-                    .queued_operations
-                    .push_back(op);
-
-                info!("scheduled layer file deletion {name}");
-                deletions_queued += 1;
-            }
-
-            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
-
-            (
-                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
-                deletions_queued,
-            )
+                .latest_files
+                .drain()
+                .map(|(file_name, meta)| {
+                    remote_layer_path(
+                        &self.tenant_id,
+                        &self.timeline_id,
+                        &file_name,
+                        meta.generation,
+                    )
+                })
+                .collect()
         };
 
-        receiver.changed().await.context("upload queue shut down")?;
+        let layer_deletion_count = layers.len();
+        self.deletion_queue_client.push_immediate(layers).await?;
 
         // Do not delete index part yet, it is needed for possible retry. If we remove it first
         // and retry will arrive to different pageserver there wont be any traces of it on remote storage
         let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
 
+        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // taking the burden of listing all the layers that we already know we should delete.
+        self.deletion_queue_client.flush_immediate().await?;
+
         let remaining = backoff::retry(
             || async {
                 self.storage_impl
@@ -910,17 +913,9 @@ impl RemoteTimelineClient {
             })
             .collect();
 
+        let not_referenced_count = remaining.len();
         if !remaining.is_empty() {
-            backoff::retry(
-                || async { self.storage_impl.delete_objects(&remaining).await },
-                |_e| false,
-                FAILED_UPLOAD_WARN_THRESHOLD,
-                FAILED_REMOTE_OP_RETRIES,
-                "delete_objects",
-                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
-            )
-            .await
-            .context("delete_objects")?;
+            self.deletion_queue_client.push_immediate(remaining).await?;
         }
 
         fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -931,18 +926,14 @@ impl RemoteTimelineClient {
 
         let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
 
-        debug!("deleting index part");
+        debug!("enqueuing index part deletion");
+        self.deletion_queue_client
+            .push_immediate([index_file_path].to_vec())
+            .await?;
 
-        backoff::retry(
-            || async { self.storage_impl.delete(&index_file_path).await },
-            |_e| false,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "delete_index",
-            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
-        )
-        .await
-        .context("delete_index")?;
+        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
+        // for a flush to a persistent deletion list so that we may be sure deletion will occur.
+        self.deletion_queue_client.flush_immediate().await?;
 
         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
@@ -950,7 +941,7 @@ impl RemoteTimelineClient {
             ))?
         });
 
-        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
+        info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
 
         Ok(())
     }
@@ -1140,21 +1131,16 @@ impl RemoteTimelineClient {
                     }
                     res
                 }
-                UploadOp::Delete(delete) => {
-                    let path = &self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(delete.layer_file_name.file_name());
-                    delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
-                        .measure_remote_op(
-                            self.tenant_id,
-                            self.timeline_id,
-                            delete.file_kind,
-                            RemoteOpKind::Delete,
-                            Arc::clone(&self.metrics),
-                        )
-                        .await
-                }
+                UploadOp::Delete(delete) => self
+                    .deletion_queue_client
+                    .push_layers(
+                        self.tenant_id,
+                        self.timeline_id,
+                        self.generation,
+                        delete.layers.clone(),
+                    )
+                    .await
+                    .map_err(|e| anyhow::anyhow!(e)),
                 UploadOp::Barrier(_) => {
                     // unreachable. Barrier operations are handled synchronously in
                     // launch_queued_tasks
@@ -1210,18 +1196,12 @@ impl RemoteTimelineClient {
         }
 
         // The task has completed successfully. Remove it from the in-progress list.
-        {
+        let lsn_update = {
             let mut upload_queue_guard = self.upload_queue.lock().unwrap();
             let upload_queue = match upload_queue_guard.deref_mut() {
                 UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(stopped) => {
-                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
-                    // then stop() took care of it so we just return.
-                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
-                    match &task.op {
-                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
-                        _ => None
-                    }
+                UploadQueue::Stopped(_stopped) => {
+                    None
                 },
                 UploadQueue::Initialized(qi) => { Some(qi) }
             };
@@ -1236,23 +1216,51 @@ impl RemoteTimelineClient {
 
             upload_queue.inprogress_tasks.remove(&task.task_id);
 
-            match task.op {
+            let lsn_update = match task.op {
                 UploadOp::UploadLayer(_, _) => {
                     upload_queue.num_inprogress_layer_uploads -= 1;
+                    None
                 }
                 UploadOp::UploadMetadata(_, lsn) => {
                     upload_queue.num_inprogress_metadata_uploads -= 1;
-                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
+                    // XXX monotonicity check?
+
+                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
+                    if self.generation.is_none() {
+                        // Legacy mode: skip validating generation
+                        upload_queue.visible_remote_consistent_lsn.store(lsn);
+                        None
+                    } else {
+                        Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
+                    }
                 }
                 UploadOp::Delete(_) => {
                     upload_queue.num_inprogress_deletions -= 1;
+                    None
                 }
                 UploadOp::Barrier(_) => unreachable!(),
             };
 
             // Launch any queued tasks that were unblocked by this one.
             self.launch_queued_tasks(upload_queue);
+            lsn_update
+        };
+
+        if let Some((lsn, slot)) = lsn_update {
+            // Updates to the remote_consistent_lsn we advertise to pageservers
+            // are all routed through the DeletionQueue, to enforce important
+            // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
+            self.deletion_queue_client
+                .update_remote_consistent_lsn(
+                    self.tenant_id,
+                    self.timeline_id,
+                    self.generation,
+                    lsn,
+                    slot,
+                )
+                .await;
         }
+
         self.calls_unfinished_metric_end(&task.op);
     }
 
@@ -1278,8 +1286,8 @@ impl RemoteTimelineClient {
                     reason: "metadata uploads are tiny",
                 },
             ),
-            UploadOp::Delete(delete) => (
-                delete.file_kind,
+            UploadOp::Delete(_delete) => (
+                RemoteOpFileKind::Layer,
                 RemoteOpKind::Delete,
                 DontTrackSize {
                     reason: "should we track deletes? positive or negative sign?",
@@ -1341,7 +1349,10 @@ impl RemoteTimelineClient {
                         latest_files: initialized.latest_files.clone(),
                         latest_files_changes_since_metadata_upload_scheduled: 0,
                         latest_metadata: initialized.latest_metadata.clone(),
-                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
+                        projected_remote_consistent_lsn: None,
+                        visible_remote_consistent_lsn: initialized
+                            .visible_remote_consistent_lsn
+                            .clone(),
                         num_inprogress_layer_uploads: 0,
                         num_inprogress_metadata_uploads: 0,
                         num_inprogress_deletions: 0,
@@ -1405,13 +1416,13 @@ pub fn remote_layer_path(
     tenant_id: &TenantId,
     timeline_id: &TimelineId,
     layer_file_name: &LayerFileName,
-    layer_meta: &LayerFileMetadata,
+    generation: Generation,
 ) -> RemotePath {
     // Generation-aware key format
     let path = format!(
         "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
         layer_file_name.file_name(),
-        layer_meta.generation.get_suffix()
+        generation.get_suffix()
     );
 
     RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1554,7 +1565,6 @@ mod tests {
 
     impl TestSetup {
         async fn new(test_name: &str) -> anyhow::Result<Self> {
-            // Use a current-thread runtime in the test
             let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
             let harness = TenantHarness::create(test_name)?;
             let (tenant, ctx) = harness.load().await;
@@ -1580,6 +1590,7 @@ mod tests {
                 timeline_id: TIMELINE_ID,
                 generation,
                 storage_impl: self.harness.remote_storage.clone(),
+                deletion_queue_client: self.harness.deletion_queue.new_client(),
                 upload_queue: Mutex::new(UploadQueue::Uninitialized),
                 metrics: Arc::new(RemoteTimelineClientMetrics::new(
                     &self.harness.tenant_id,
@@ -1749,7 +1760,7 @@ mod tests {
             )
             .unwrap();
         client
-            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
             .unwrap();
         {
             let mut guard = client.upload_queue.lock().unwrap();
@@ -1775,6 +1786,7 @@ mod tests {
 
         // Finish them
         client.wait_completion().await.unwrap();
+        harness.deletion_queue.pump().await;
 
         assert_remote_files(
             &[
diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs
deleted file mode 100644
index 7324559223..0000000000
--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-//! Helper functions to delete files from remote storage with a RemoteStorage
-use anyhow::Context;
-use std::path::Path;
-use tracing::debug;
-
-use remote_storage::GenericRemoteStorage;
-
-use crate::{
-    config::PageServerConf,
-    tenant::{remote_timeline_client::remote_path, Generation},
-};
-
-pub(super) async fn delete_layer<'a>(
-    conf: &'static PageServerConf,
-    storage: &'a GenericRemoteStorage,
-    local_layer_path: &'a Path,
-    generation: Generation,
-) -> anyhow::Result<()> {
-    fail::fail_point!("before-delete-layer", |_| {
-        anyhow::bail!("failpoint before-delete-layer")
-    });
-    debug!("Deleting layer from remote storage: {local_layer_path:?}",);
-
-    let path_to_delete = remote_path(conf, local_layer_path, generation)?;
-
-    // We don't want to print an error if the delete failed if the file has
-    // already been deleted. Thankfully, in this situation S3 already
-    // does not yield an error. While OS-provided local file system APIs do yield
-    // errors, we avoid them in the `LocalFs` wrapper.
-    storage
-        .delete(&path_to_delete)
-        .await
-        .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
-}
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 9863215529..5c173c613f 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>(
         .timeline_path(&tenant_id, &timeline_id)
         .join(layer_file_name.file_name());
 
-    let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
+    let remote_path = remote_layer_path(
+        &tenant_id,
+        &timeline_id,
+        layer_file_name,
+        layer_metadata.generation,
+    );
 
     // Perform a rename inspired by durable_rename from file_utils.c.
     // The sequence:
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 78ac1338db..9b62ba1c50 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
     AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
@@ -143,6 +144,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
     pub remote_client: Option<RemoteTimelineClient>,
+    pub deletion_queue_client: DeletionQueueClient,
 }
 
 pub struct Timeline {
@@ -521,9 +523,23 @@ impl Timeline {
         self.disk_consistent_lsn.load()
     }
 
-    pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
+    /// remote_consistent_lsn from the perspective of the tenant's current generation,
+    /// not validated with control plane yet.
+    /// See [`Self::get_remote_consistent_lsn_visible`].
+    pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
         if let Some(remote_client) = &self.remote_client {
-            remote_client.last_uploaded_consistent_lsn()
+            remote_client.remote_consistent_lsn_projected()
+        } else {
+            None
+        }
+    }
+
+    /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
+    /// i.e. a value of remote_consistent_lsn_projected which has undergone
+    /// generation validation in the deletion queue.
+    pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
+        if let Some(remote_client) = &self.remote_client {
+            remote_client.remote_consistent_lsn_visible()
         } else {
             None
         }
@@ -543,7 +559,7 @@ impl Timeline {
     }
 
     pub fn resident_physical_size(&self) -> u64 {
-        self.metrics.resident_physical_size_gauge.get()
+        self.metrics.resident_physical_size_get()
     }
 
     ///
@@ -1293,10 +1309,7 @@ impl Timeline {
         // will treat the file as a local layer again, count it towards resident size,
         // and it'll be like the layer removal never happened.
         // The bump in resident size is perhaps unexpected but overall a robust behavior.
-        self.metrics
-            .resident_physical_size_gauge
-            .sub(layer_file_size);
-
+        self.metrics.resident_physical_size_sub(layer_file_size);
         self.metrics.evictions.inc();
 
         if let Some(delta) = local_layer_residence_duration {
@@ -1820,7 +1833,7 @@ impl Timeline {
             for (layer, m) in needs_upload {
                 rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
             }
-            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
+            rtc.schedule_layer_file_deletion(needs_cleanup)?;
             rtc.schedule_index_upload_for_file_changes()?;
             // Tenant::create_timeline will wait for these uploads to happen before returning, or
             // on retry.
@@ -1830,9 +1843,7 @@ impl Timeline {
             "loaded layer map with {} layers at {}, total physical size: {}",
             num_layers, disk_consistent_lsn, total_physical_size
         );
-        self.metrics
-            .resident_physical_size_gauge
-            .set(total_physical_size);
+        self.metrics.resident_physical_size_set(total_physical_size);
 
         timer.stop_and_record();
         Ok(())
@@ -3875,7 +3886,7 @@ impl Timeline {
 
         // Also schedule the deletions in remote storage
         if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+            remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
         }
 
         Ok(())
@@ -4210,7 +4221,7 @@ impl Timeline {
             }
 
             if let Some(remote_client) = &self.remote_client {
-                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+                remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
             }
 
             apply.flush();
@@ -4382,7 +4393,7 @@ impl Timeline {
 
                     // XXX the temp file is still around in Err() case
                     // and consumes space until we clean up upon pageserver restart.
-                    self_clone.metrics.resident_physical_size_gauge.add(*size);
+                    self_clone.metrics.resident_physical_size_add(*size);
 
                     // Download complete. Replace the RemoteLayer with the corresponding
                     // Delta- or ImageLayer in the layer map.
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 18588cf0fd..7d55388f44 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,6 +14,7 @@ use utils::{
 
 use crate::{
     config::PageServerConf,
+    deletion_queue::DeletionQueueClient,
     task_mgr::{self, TaskKind},
     tenant::{
         metadata::TimelineMetadata,
@@ -407,6 +408,7 @@ impl DeleteTimelineFlow {
         timeline_id: TimelineId,
         local_metadata: &TimelineMetadata,
         remote_client: Option<RemoteTimelineClient>,
+        deletion_queue_client: DeletionQueueClient,
         init_order: Option<&InitializationOrder>,
     ) -> anyhow::Result<()> {
         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -416,7 +418,10 @@ impl DeleteTimelineFlow {
                 timeline_id,
                 local_metadata,
                 None, // Ancestor is not needed for deletion.
-                TimelineResources { remote_client },
+                TimelineResources {
+                    remote_client,
+                    deletion_queue_client,
+                },
                 init_order,
                 // Important. We dont pass ancestor above because it can be missing.
                 // Thus we need to skip the validation here.
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 3c88d31f24..0a387bd779 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -263,7 +263,7 @@ impl LayerManager {
         let desc = layer.layer_desc();
         if !layer.is_remote_layer() {
             layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_gauge.sub(desc.file_size);
+            metrics.resident_physical_size_sub(desc.file_size);
         }
 
         // TODO Removing from the bottom of the layer map is expensive.
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 7d1e9b4a39..0831b9ceda 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection(
             })?;
 
         if let Some(last_lsn) = status_update {
-            let timeline_remote_consistent_lsn =
-                timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+            let timeline_remote_consistent_lsn = timeline
+                .get_remote_consistent_lsn_visible()
+                .unwrap_or(Lsn(0));
 
             // The last LSN we processed. It is not guaranteed to survive pageserver crash.
             let last_received_lsn = last_lsn;
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 28822335b0..08b1cb8866 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,5 +1,3 @@
-use crate::metrics::RemoteOpFileKind;
-
 use super::storage_layer::LayerFileName;
 use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
@@ -11,6 +9,7 @@ use std::fmt::Debug;
 use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;
+use utils::lsn::AtomicLsn;
 
 use std::sync::atomic::AtomicU32;
 use utils::lsn::Lsn;
@@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized {
     /// uploaded. `Lsn(0)` if nothing was uploaded yet.
     /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
     /// Safekeeper can rely on it to make decisions for WAL storage.
-    pub(crate) last_uploaded_consistent_lsn: Lsn,
+    ///
+    /// visible_remote_consistent_lsn is only updated after our generation has been validated with
+    /// the control plane (unlesss a timeline's generation is None, in which case
+    /// we skip validation)
+    pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
+    pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
 
     // Breakdown of different kinds of tasks currently in-progress
     pub(crate) num_inprogress_layer_uploads: usize,
@@ -81,6 +85,14 @@ impl UploadQueueInitialized {
     pub(super) fn no_pending_work(&self) -> bool {
         self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
     }
+
+    pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
+        self.visible_remote_consistent_lsn.load()
+    }
+
+    pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+        self.projected_remote_consistent_lsn
+    }
 }
 
 #[derive(Clone, Copy)]
@@ -114,9 +126,8 @@ impl UploadQueue {
             latest_files: HashMap::new(),
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: metadata.clone(),
-            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
-            // safekeepers from garbage-collecting anything.
-            last_uploaded_consistent_lsn: Lsn(0),
+            projected_remote_consistent_lsn: None,
+            visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
             // what follows are boring default initializations
             task_counter: 0,
             num_inprogress_layer_uploads: 0,
@@ -158,7 +169,10 @@ impl UploadQueue {
             latest_files: files,
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: index_part.metadata.clone(),
-            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
+            projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
+            visible_remote_consistent_lsn: Arc::new(
+                index_part.metadata.disk_consistent_lsn().into(),
+            ),
             // what follows are boring default initializations
             task_counter: 0,
             num_inprogress_layer_uploads: 0,
@@ -201,12 +215,11 @@ pub(crate) struct UploadTask {
     pub(crate) op: UploadOp,
 }
 
+/// A deletion of some layers within the lifetime of a timeline.  This is not used
+/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug)]
 pub(crate) struct Delete {
-    pub(crate) file_kind: RemoteOpFileKind,
-    pub(crate) layer_file_name: LayerFileName,
-    pub(crate) scheduled_from_timeline_delete: bool,
-    pub(crate) generation: Generation,
+    pub(crate) layers: Vec<(LayerFileName, Generation)>,
 }
 
 #[derive(Debug)]
@@ -217,7 +230,7 @@ pub(crate) enum UploadOp {
     /// Upload the metadata file
     UploadMetadata(IndexPart, Lsn),
 
-    /// Delete a layer file
+    /// Delete layer files
     Delete(Delete),
 
     /// Barrier. When the barrier operation is reached,
@@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp {
             UploadOp::UploadMetadata(_, lsn) => {
                 write!(f, "UploadMetadata(lsn: {})", lsn)
             }
-            UploadOp::Delete(delete) => write!(
-                f,
-                "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
-                delete.layer_file_name.file_name(),
-                delete.scheduled_from_timeline_delete,
-                delete.generation
-            ),
+            UploadOp::Delete(delete) => {
+                write!(f, "Delete({} layers)", delete.layers.len(),)
+            }
             UploadOp::Barrier(_) => write!(f, "Barrier"),
         }
     }
diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index debbbce117..8b0035b8e8 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -741,6 +741,13 @@ NeonProcessUtility(
 			break;
 		case T_DropdbStmt:
 			HandleDropDb(castNode(DropdbStmt, parseTree));
+			/*
+			 * We do this here to hack around the fact that Postgres performs the drop
+			 * INSIDE of standard_ProcessUtility, which means that if we try to
+			 * abort the drop normally it'll be too late. DROP DATABASE can't be inside
+			 * of a transaction block anyway, so this should be fine to do.
+			 */
+			NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
 			break;
 		case T_CreateRoleStmt:
 			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 4be75e1dad..22f20a4c0b 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -14,7 +14,6 @@
  */
 
 #include <sys/file.h>
-#include <sys/statvfs.h>
 #include <unistd.h>
 #include <fcntl.h>
 
@@ -38,9 +37,6 @@
 #include "storage/fd.h"
 #include "storage/pg_shmem.h"
 #include "storage/buf_internals.h"
-#include "storage/procsignal.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"
 
 /*
  * Local file cache is used to temporary store relations pages in local file system.
@@ -66,9 +62,6 @@
 
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
 
-#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
-#define MAX_DISK_WRITE_RATE       1000 /* MB/sec */
-
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
@@ -91,14 +84,12 @@ static int   lfc_desc = 0;
 static LWLockId lfc_lock;
 static int   lfc_max_size;
 static int   lfc_size_limit;
-static int   lfc_free_space_watermark;
 static char* lfc_path;
 static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif
-static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
 
 void FileCacheMonitorMain(Datum main_arg);
 
@@ -254,80 +245,6 @@ lfc_change_limit_hook(int newval, void *extra)
 	LWLockRelease(lfc_lock);
 }
 
-/*
- * Local file system state monitor check available free space.
- * If it is lower than lfc_free_space_watermark then we shrink size of local cache
- * but throwing away least recently accessed chunks.
- * First time low space watermark is reached cache size is divided by two,
- * second time by four,... Finally we remove all chunks from local cache.
- *
- * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
- * We only throw away cached chunks but do not prevent from filling cache by new chunks.
- *
- * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
- * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
- * Calling statvfs each second should not add any noticeable overhead.
- */
-void
-FileCacheMonitorMain(Datum main_arg)
-{
-	/*
-	 * Choose file system state monitor interval so that space can not be exosted
-	 * during this period but not longer than  MAX_MONITOR_INTERVAL (10 sec)
-	 */
-	uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
-
-	/* Establish signal handlers. */
-	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
-	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
-	BackgroundWorkerUnblockSignals();
-
-	/* Periodically dump buffers until terminated. */
-	while (!ShutdownRequestPending)
-	{
-		if (lfc_size_limit != 0)
-		{
-			struct statvfs sfs;
-			if (statvfs(lfc_path, &sfs) < 0)
-			{
-				elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
-			}
-			else
-			{
-				if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
-				{
-					if (lfc_shrinking_factor < 31) {
-						lfc_shrinking_factor += 1;
-					}
-					lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
-				}
-				else
-					lfc_shrinking_factor = 0; /* reset to initial value */
-			}
-		}
-		pg_usleep(monitor_interval);
-	}
-}
-
-static void
-lfc_register_free_space_monitor(void)
-{
-	BackgroundWorker bgw;
-	memset(&bgw, 0, sizeof(bgw));
-	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
-	snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
-	snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
-	bgw.bgw_restart_time = 5;
-	bgw.bgw_notify_pid = 0;
-	bgw.bgw_main_arg = (Datum) 0;
-
-	RegisterBackgroundWorker(&bgw);
-}
-
 void
 lfc_init(void)
 {
@@ -364,19 +281,6 @@ lfc_init(void)
 							lfc_change_limit_hook,
 							NULL);
 
-	DefineCustomIntVariable("neon.free_space_watermark",
-							"Minimal free space in local file system after reaching which local file cache will be truncated",
-							NULL,
-							&lfc_free_space_watermark,
-							1024, /* 1GB */
-							0,
-							INT_MAX,
-							PGC_SIGHUP,
-							GUC_UNIT_MB,
-							NULL,
-							NULL,
-							NULL);
-
 	DefineCustomStringVariable("neon.file_cache_path",
 							   "Path to local file cache (can be raw device)",
 							   NULL,
@@ -391,9 +295,6 @@ lfc_init(void)
 	if (lfc_max_size == 0)
 		return;
 
-	if (lfc_free_space_watermark != 0)
-		lfc_register_free_space_monitor();
-
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = lfc_shmem_startup;
 #if PG_VERSION_NUM>=150000
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index cbab0c6f07..92498d3ecd 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -42,6 +42,7 @@ reqwest-middleware.workspace = true
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
 routerify.workspace = true
+rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
 scopeguard.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index ff73f2b625..03c9029862 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -160,6 +160,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
             Test(_) => Some("test".to_owned()),
         }
     }
+
+    /// Get username from the credentials.
+    pub fn get_user(&self) -> &str {
+        use BackendType::*;
+
+        match self {
+            Console(_, creds) => creds.user,
+            Postgres(_, creds) => creds.user,
+            Link(_) => "link",
+            Test(_) => "test",
+        }
+    }
+
     /// Authenticate the client via the requested backend, possibly using credentials.
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub async fn authenticate(
diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs
index e771e5d7ed..a7ef15d342 100644
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -17,11 +17,12 @@ use std::{
 use tokio::time;
 use tokio_postgres::AsyncMessage;
 
-use crate::{auth, console};
+use crate::{
+    auth, console,
+    metrics::{Ids, MetricCounter, USAGE_METRICS},
+};
 use crate::{compute, config};
 
-use super::sql_over_http::MAX_RESPONSE_SIZE;
-
 use crate::proxy::ConnectMechanism;
 
 use tracing::{error, warn};
@@ -400,7 +401,6 @@ async fn connect_to_compute_once(
         .user(&conn_info.username)
         .password(&conn_info.password)
         .dbname(&conn_info.dbname)
-        .max_backend_message_size(MAX_RESPONSE_SIZE)
         .connect_timeout(timeout)
         .connect(tokio_postgres::NoTls)
         .await?;
@@ -412,6 +412,10 @@ async fn connect_to_compute_once(
     span.in_scope(|| {
         info!(%conn_info, %session, "new connection");
     });
+    let ids = Ids {
+        endpoint_id: node_info.aux.endpoint_id.to_string(),
+        branch_id: node_info.aux.branch_id.to_string(),
+    };
 
     tokio::spawn(
         poll_fn(move |cx| {
@@ -450,10 +454,18 @@ async fn connect_to_compute_once(
     Ok(Client {
         inner: client,
         session: tx,
+        ids,
     })
 }
 
 pub struct Client {
     pub inner: tokio_postgres::Client,
     session: tokio::sync::watch::Sender<uuid::Uuid>,
+    ids: Ids,
+}
+
+impl Client {
+    pub fn metrics(&self) -> Arc<MetricCounter> {
+        USAGE_METRICS.register(self.ids.clone())
+    }
 }
diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs
index fe57096105..b74b3e9646 100644
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -3,10 +3,12 @@ use std::sync::Arc;
 use anyhow::bail;
 use futures::pin_mut;
 use futures::StreamExt;
-use hashbrown::HashMap;
 use hyper::body::HttpBody;
+use hyper::header;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
+use hyper::Response;
+use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
@@ -16,7 +18,11 @@ use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
+use tracing::error;
+use tracing::instrument;
 use url::Url;
+use utils::http::error::ApiError;
+use utils::http::json::json_response;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
@@ -39,7 +45,6 @@ enum Payload {
     Batch(BatchQueryData),
 }
 
-pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
@@ -182,7 +187,45 @@ pub async fn handle(
     sni_hostname: Option<String>,
     conn_pool: Arc<GlobalConnPool>,
     session_id: uuid::Uuid,
-) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
+) -> Result<Response<Body>, ApiError> {
+    let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
+
+    let mut response = match result {
+        Ok(r) => r,
+        Err(e) => {
+            let message = format!("{:?}", e);
+            let code = match e.downcast_ref::<tokio_postgres::Error>() {
+                Some(e) => match e.code() {
+                    Some(e) => serde_json::to_value(e.code()).unwrap(),
+                    None => Value::Null,
+                },
+                None => Value::Null,
+            };
+            error!(
+                ?code,
+                "sql-over-http per-client task finished with an error: {e:#}"
+            );
+            // TODO: this shouldn't always be bad request.
+            json_response(
+                StatusCode::BAD_REQUEST,
+                json!({ "message": message, "code": code }),
+            )?
+        }
+    };
+    response.headers_mut().insert(
+        "Access-Control-Allow-Origin",
+        hyper::http::HeaderValue::from_static("*"),
+    );
+    Ok(response)
+}
+
+#[instrument(name = "sql-over-http", skip_all)]
+async fn handle_inner(
+    request: Request<Body>,
+    sni_hostname: Option<String>,
+    conn_pool: Arc<GlobalConnPool>,
+    session_id: uuid::Uuid,
+) -> anyhow::Result<Response<Body>> {
     //
     // Determine the destination and connection params
     //
@@ -233,13 +276,18 @@ pub async fn handle(
 
     let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;
 
+    let mut response = Response::builder()
+        .status(StatusCode::OK)
+        .header(header::CONTENT_TYPE, "application/json");
+
     //
     // Now execute the query and return the result
     //
+    let mut size = 0;
     let result = match payload {
-        Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
-            .await
-            .map(|x| (x, HashMap::default())),
+        Payload::Single(query) => {
+            query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await
+        }
         Payload::Batch(batch_query) => {
             let mut results = Vec::new();
             let mut builder = client.inner.build_transaction();
@@ -254,7 +302,8 @@ pub async fn handle(
             }
             let transaction = builder.start().await?;
             for query in batch_query.queries {
-                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
+                let result =
+                    query_to_json(&transaction, query, &mut size, raw_output, array_mode).await;
                 match result {
                     Ok(r) => results.push(r),
                     Err(e) => {
@@ -264,26 +313,27 @@ pub async fn handle(
                 }
             }
             transaction.commit().await?;
-            let mut headers = HashMap::default();
             if txn_read_only {
-                headers.insert(
+                response = response.header(
                     TXN_READ_ONLY.clone(),
                     HeaderValue::try_from(txn_read_only.to_string())?,
                 );
             }
             if txn_deferrable {
-                headers.insert(
+                response = response.header(
                     TXN_DEFERRABLE.clone(),
                     HeaderValue::try_from(txn_deferrable.to_string())?,
                 );
             }
             if let Some(txn_isolation_level) = txn_isolation_level_raw {
-                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
             }
-            Ok((json!({ "results": results }), headers))
+            Ok(json!({ "results": results }))
         }
     };
 
+    let metrics = client.metrics();
+
     if allow_pool {
         let current_span = tracing::Span::current();
         // return connection to the pool
@@ -293,12 +343,30 @@ pub async fn handle(
         });
     }
 
-    result
+    match result {
+        Ok(value) => {
+            // how could this possibly fail
+            let body = serde_json::to_string(&value).expect("json serialization should not fail");
+            let len = body.len();
+            let response = response
+                .body(Body::from(body))
+                // only fails if invalid status code or invalid header/values are given.
+                // these are not user configurable so it cannot fail dynamically
+                .expect("building response payload should not fail");
+
+            // count the egress bytes - we miss the TLS and header overhead but oh well...
+            // moving this later in the stack is going to be a lot of effort and ehhhh
+            metrics.record_egress(len as u64);
+            Ok(response)
+        }
+        Err(e) => Err(e),
+    }
 }
 
 async fn query_to_json<T: GenericClient>(
     client: &T,
     data: QueryData,
+    current_size: &mut usize,
     raw_output: bool,
     array_mode: bool,
 ) -> anyhow::Result<Value> {
@@ -312,16 +380,10 @@ async fn query_to_json<T: GenericClient>(
     // big.
     pin_mut!(row_stream);
     let mut rows: Vec<tokio_postgres::Row> = Vec::new();
-    let mut current_size = 0;
     while let Some(row) = row_stream.next().await {
         let row = row?;
-        current_size += row.body_len();
+        *current_size += row.body_len();
         rows.push(row);
-        if current_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!(
-                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
-            ));
-        }
     }
 
     // grab the command tag and number of rows affected
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
index fa66df0469..994a7de764 100644
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -7,7 +7,6 @@ use crate::{
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
-use hashbrown::HashMap;
 use hyper::{
     server::{
         accept,
@@ -18,7 +17,6 @@ use hyper::{
 };
 use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
 use pin_project_lite::pin_project;
-use serde_json::{json, Value};
 
 use std::{
     convert::Infallible,
@@ -204,44 +202,7 @@ async fn ws_handler(
     // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
     // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
-            .instrument(info_span!("sql-over-http"))
-            .await;
-        let status_code = match result {
-            Ok(_) => StatusCode::OK,
-            Err(_) => StatusCode::BAD_REQUEST,
-        };
-        let (json, headers) = match result {
-            Ok(r) => r,
-            Err(e) => {
-                let message = format!("{:?}", e);
-                let code = match e.downcast_ref::<tokio_postgres::Error>() {
-                    Some(e) => match e.code() {
-                        Some(e) => serde_json::to_value(e.code()).unwrap(),
-                        None => Value::Null,
-                    },
-                    None => Value::Null,
-                };
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
-                (
-                    json!({ "message": message, "code": code }),
-                    HashMap::default(),
-                )
-            }
-        };
-        json_response(status_code, json).map(|mut r| {
-            r.headers_mut().insert(
-                "Access-Control-Allow-Origin",
-                hyper::http::HeaderValue::from_static("*"),
-            );
-            for (k, v) in headers {
-                r.headers_mut().insert(k, v);
-            }
-            r
-        })
+        sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
         Response::builder()
             .header("Allow", "OPTIONS, POST")
@@ -253,7 +214,7 @@ async fn ws_handler(
             .header("Access-Control-Max-Age", "86400" /* 24 hours */)
             .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
             .body(Body::empty())
-            .map_err(|e| ApiError::BadRequest(e.into()))
+            .map_err(|e| ApiError::InternalServerError(e.into()))
     } else {
         json_response(StatusCode::BAD_REQUEST, "query is not supported")
     }
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 9279002eb3..cfeec5622b 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -3,9 +3,18 @@
 use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
-use serde::Serialize;
-use std::{collections::HashMap, convert::Infallible, time::Duration};
-use tracing::{error, info, instrument, trace, warn};
+use dashmap::{mapref::entry::Entry, DashMap};
+use once_cell::sync::Lazy;
+use serde::{Deserialize, Serialize};
+use std::{
+    convert::Infallible,
+    sync::{
+        atomic::{AtomicU64, AtomicUsize, Ordering},
+        Arc,
+    },
+    time::Duration,
+};
+use tracing::{error, info, instrument, trace};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
@@ -18,12 +27,95 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// Both the proxy and the ingestion endpoint will live in the same region (or cell)
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
-#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
+#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
     pub endpoint_id: String,
     pub branch_id: String,
 }
 
+#[derive(Debug)]
+pub struct MetricCounter {
+    transmitted: AtomicU64,
+    opened_connections: AtomicUsize,
+}
+
+impl MetricCounter {
+    /// Record that some bytes were sent from the proxy to the client
+    pub fn record_egress(&self, bytes: u64) {
+        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
+    }
+
+    /// extract the value that should be reported
+    fn should_report(self: &Arc<Self>) -> Option<u64> {
+        // heuristic to see if the branch is still open
+        // if a clone happens while we are observing, the heuristic will be incorrect.
+        //
+        // Worst case is that we won't report an event for this endpoint.
+        // However, for the strong count to be 1 it must have occured that at one instant
+        // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
+        let is_open = Arc::strong_count(self) > 1;
+        let opened = self.opened_connections.swap(0, Ordering::AcqRel);
+
+        // update cached metrics eagerly, even if they can't get sent
+        // (to avoid sending the same metrics twice)
+        // see the relevant discussion on why to do so even if the status is not success:
+        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
+        let value = self.transmitted.swap(0, Ordering::AcqRel);
+
+        // Our only requirement is that we report in every interval if there was an open connection
+        // if there were no opened connections since, then we don't need to report
+        if value == 0 && !is_open && opened == 0 {
+            None
+        } else {
+            Some(value)
+        }
+    }
+
+    /// Determine whether the counter should be cleared from the global map.
+    fn should_clear(self: &mut Arc<Self>) -> bool {
+        // we can't clear this entry if it's acquired elsewhere
+        let Some(counter) = Arc::get_mut(self) else {
+            return false;
+        };
+        let opened = *counter.opened_connections.get_mut();
+        let value = *counter.transmitted.get_mut();
+        // clear if there's no data to report
+        value == 0 && opened == 0
+    }
+}
+
+// endpoint and branch IDs are not user generated so we don't run the risk of hash-dos
+type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
+
+#[derive(Default)]
+pub struct Metrics {
+    endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
+}
+
+impl Metrics {
+    /// Register a new byte metrics counter for this endpoint
+    pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
+        let entry = if let Some(entry) = self.endpoints.get(&ids) {
+            entry.clone()
+        } else {
+            self.endpoints
+                .entry(ids)
+                .or_insert_with(|| {
+                    Arc::new(MetricCounter {
+                        transmitted: AtomicU64::new(0),
+                        opened_connections: AtomicUsize::new(0),
+                    })
+                })
+                .clone()
+        };
+
+        entry.opened_connections.fetch_add(1, Ordering::AcqRel);
+        entry
+    }
+}
+
+pub static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
+
 pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
     info!("metrics collector config: {config:?}");
     scopeguard::defer! {
@@ -31,145 +123,83 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
     }
 
     let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
-    let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
     let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
+    let mut prev = Utc::now();
     let mut ticker = tokio::time::interval(config.interval);
     loop {
         ticker.tick().await;
 
-        let res = collect_metrics_iteration(
+        let now = Utc::now();
+        collect_metrics_iteration(
+            &USAGE_METRICS,
             &http_client,
-            &mut cached_metrics,
             &config.endpoint,
             &hostname,
+            prev,
+            now,
         )
         .await;
-
-        match res {
-            Err(e) => error!("failed to send consumption metrics: {e} "),
-            Ok(_) => trace!("periodic metrics collection completed successfully"),
-        }
+        prev = now;
     }
 }
 
-fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
-    let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
-    let metrics = prometheus::default_registry().gather();
-
-    for m in metrics {
-        if m.get_name() == "proxy_io_bytes_per_client" {
-            for ms in m.get_metric() {
-                let direction = ms
-                    .get_label()
-                    .iter()
-                    .find(|l| l.get_name() == "direction")
-                    .unwrap()
-                    .get_value();
-
-                // Only collect metric for outbound traffic
-                if direction == "tx" {
-                    let endpoint_id = ms
-                        .get_label()
-                        .iter()
-                        .find(|l| l.get_name() == "endpoint_id")
-                        .unwrap()
-                        .get_value();
-                    let branch_id = ms
-                        .get_label()
-                        .iter()
-                        .find(|l| l.get_name() == "branch_id")
-                        .unwrap()
-                        .get_value();
-
-                    let value = ms.get_counter().get_value() as u64;
-
-                    // Report if the metric value is suspiciously large
-                    if value > (1u64 << 40) {
-                        warn!(
-                            "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
-                            branch_id, endpoint_id, value
-                        );
-                    }
-
-                    current_metrics.push((
-                        Ids {
-                            endpoint_id: endpoint_id.to_string(),
-                            branch_id: branch_id.to_string(),
-                        },
-                        (value, Utc::now()),
-                    ));
-                }
-            }
-        }
-    }
-
-    current_metrics
-}
-
 #[instrument(skip_all)]
 async fn collect_metrics_iteration(
+    metrics: &Metrics,
     client: &http::ClientWithMiddleware,
-    cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
     metric_collection_endpoint: &reqwest::Url,
     hostname: &str,
-) -> anyhow::Result<()> {
+    prev: DateTime<Utc>,
+    now: DateTime<Utc>,
+) {
     info!(
         "starting collect_metrics_iteration. metric_collection_endpoint: {}",
         metric_collection_endpoint
     );
 
-    let current_metrics = gather_proxy_io_bytes_per_client();
+    let mut metrics_to_clear = Vec::new();
 
-    let metrics_to_send: Vec<Event<Ids, &'static str>> = current_metrics
+    let metrics_to_send: Vec<(Ids, u64)> = metrics
+        .endpoints
         .iter()
-        .filter_map(|(curr_key, (curr_val, curr_time))| {
-            let mut start_time = *curr_time;
-            let mut value = *curr_val;
-
-            if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
-                // Only send metrics updates if the metric has increased
-                if curr_val > prev_val {
-                    value = curr_val - prev_val;
-                    start_time = *prev_time;
-                } else {
-                    if curr_val < prev_val {
-                        error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
-                        prev_val, curr_val, curr_key);
-                    }
-                    return None;
-                }
+        .filter_map(|counter| {
+            let key = counter.key().clone();
+            let Some(value) = counter.should_report() else {
+                metrics_to_clear.push(key);
+                return None;
             };
-
-            Some(Event {
-                kind: EventType::Incremental {
-                    start_time,
-                    stop_time: *curr_time,
-                },
-                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname),
-                value,
-                extra: Ids {
-                    endpoint_id: curr_key.endpoint_id.clone(),
-                    branch_id: curr_key.branch_id.clone(),
-                },
-            })
+            Some((key, value))
         })
         .collect();
 
     if metrics_to_send.is_empty() {
         trace!("no new metrics to send");
-        return Ok(());
     }
 
     // Send metrics.
     // Split into chunks of 1000 metrics to avoid exceeding the max request size
     for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
+        let events = chunk
+            .iter()
+            .map(|(ids, value)| Event {
+                kind: EventType::Incremental {
+                    start_time: prev,
+                    stop_time: now,
+                },
+                metric: PROXY_IO_BYTES_PER_CLIENT,
+                idempotency_key: idempotency_key(hostname),
+                value: *value,
+                extra: Ids {
+                    endpoint_id: ids.endpoint_id.clone(),
+                    branch_id: ids.branch_id.clone(),
+                },
+            })
+            .collect();
+
         let res = client
             .post(metric_collection_endpoint.clone())
-            .json(&EventChunk {
-                events: chunk.into(),
-            })
+            .json(&EventChunk { events })
             .send()
             .await;
 
@@ -183,34 +213,113 @@ async fn collect_metrics_iteration(
 
         if !res.status().is_success() {
             error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
+            for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
                 // Report if the metric value is suspiciously large
                 error!("potentially abnormal metric value: {:?}", metric);
             }
         }
-        // update cached metrics after they were sent
-        // (to avoid sending the same metrics twice)
-        // see the relevant discussion on why to do so even if the status is not success:
-        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
-        for send_metric in chunk {
-            let stop_time = match send_metric.kind {
-                EventType::Incremental { stop_time, .. } => stop_time,
-                _ => unreachable!(),
-            };
+    }
 
-            cached_metrics
-                .entry(Ids {
-                    endpoint_id: send_metric.extra.endpoint_id.clone(),
-                    branch_id: send_metric.extra.branch_id.clone(),
-                })
-                // update cached value (add delta) and time
-                .and_modify(|e| {
-                    e.0 = e.0.saturating_add(send_metric.value);
-                    e.1 = stop_time
-                })
-                // cache new metric
-                .or_insert((send_metric.value, stop_time));
+    for metric in metrics_to_clear {
+        match metrics.endpoints.entry(metric) {
+            Entry::Occupied(mut counter) => {
+                if counter.get_mut().should_clear() {
+                    counter.remove_entry();
+                }
+            }
+            Entry::Vacant(_) => {}
         }
     }
-    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        net::TcpListener,
+        sync::{Arc, Mutex},
+    };
+
+    use anyhow::Error;
+    use chrono::Utc;
+    use consumption_metrics::{Event, EventChunk};
+    use hyper::{
+        service::{make_service_fn, service_fn},
+        Body, Response,
+    };
+    use url::Url;
+
+    use super::{collect_metrics_iteration, Ids, Metrics};
+    use crate::http;
+
+    #[tokio::test]
+    async fn metrics() {
+        let listener = TcpListener::bind("0.0.0.0:0").unwrap();
+
+        let reports = Arc::new(Mutex::new(vec![]));
+        let reports2 = reports.clone();
+
+        let server = hyper::server::Server::from_tcp(listener)
+            .unwrap()
+            .serve(make_service_fn(move |_| {
+                let reports = reports.clone();
+                async move {
+                    Ok::<_, Error>(service_fn(move |req| {
+                        let reports = reports.clone();
+                        async move {
+                            let bytes = hyper::body::to_bytes(req.into_body()).await?;
+                            let events: EventChunk<'static, Event<Ids, String>> =
+                                serde_json::from_slice(&bytes)?;
+                            reports.lock().unwrap().push(events);
+                            Ok::<_, Error>(Response::new(Body::from(vec![])))
+                        }
+                    }))
+                }
+            }));
+        let addr = server.local_addr();
+        tokio::spawn(server);
+
+        let metrics = Metrics::default();
+        let client = http::new_client();
+        let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
+        let now = Utc::now();
+
+        // no counters have been registered
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert!(r.is_empty());
+
+        // register a new counter
+        let counter = metrics.register(Ids {
+            endpoint_id: "e1".to_string(),
+            branch_id: "b1".to_string(),
+        });
+
+        // the counter should be observed despite 0 egress
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert_eq!(r.len(), 1);
+        assert_eq!(r[0].events.len(), 1);
+        assert_eq!(r[0].events[0].value, 0);
+
+        // record egress
+        counter.record_egress(1);
+
+        // egress should be observered
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert_eq!(r.len(), 1);
+        assert_eq!(r[0].events.len(), 1);
+        assert_eq!(r[0].events[0].value, 1);
+
+        // release counter
+        drop(counter);
+
+        // we do not observe the counter
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert!(r.is_empty());
+
+        // counter is unregistered
+        assert!(metrics.endpoints.is_empty());
+    }
 }
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index f9da145859..71e00ed58f 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -7,6 +7,7 @@ use crate::{
     compute::{self, PostgresConnection},
     config::{ProxyConfig, TlsConfig},
     console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
+    metrics::{Ids, USAGE_METRICS},
     protocol2::WithClientIp,
     stream::{PqStream, Stream},
 };
@@ -602,6 +603,11 @@ pub async fn proxy_pass(
     compute: impl AsyncRead + AsyncWrite + Unpin,
     aux: &MetricsAuxInfo,
 ) -> anyhow::Result<()> {
+    let usage = USAGE_METRICS.register(Ids {
+        endpoint_id: aux.endpoint_id.to_string(),
+        branch_id: aux.branch_id.to_string(),
+    });
+
     let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
     let mut client = MeasuredStream::new(
         client,
@@ -609,6 +615,7 @@ pub async fn proxy_pass(
         |cnt| {
             // Number of bytes we sent to the client (outbound).
             m_sent.inc_by(cnt as u64);
+            usage.record_egress(cnt as u64);
         },
     );
 
@@ -690,7 +697,14 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             .await
         {
             Ok(auth_result) => auth_result,
-            Err(e) => return stream.throw_error(e).await,
+            Err(e) => {
+                let user = creds.get_user();
+                let db = params.get("database");
+                let app = params.get("application_name");
+                let params_span = tracing::info_span!("", ?user, ?db, ?app);
+
+                return stream.throw_error(e).instrument(params_span).await;
+            }
         };
 
         let AuthSuccess {
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 1254c4e779..6fbaa08512 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -105,6 +105,8 @@ class NeonCompare(PgCompare):
         self._pg_bin = pg_bin
         self.pageserver_http_client = self.env.pageserver.http_client()
 
+        # note that neon_simple_env now uses LOCAL_FS remote storage
+
         # Create tenant
         tenant_conf: Dict[str, str] = {}
         if False:  # TODO add pytest setting for this
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0667403ba3..92e7cd06cd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -460,9 +460,11 @@ class NeonEnvBuilder:
         ), "Unexpectedly instantiated from outside a test function"
         self.test_name = test_name
 
-    def init_configs(self) -> NeonEnv:
+    def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv:
         # Cannot create more than one environment from one builder
         assert self.env is None, "environment already initialized"
+        if default_remote_storage_if_missing and self.pageserver_remote_storage is None:
+            self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
         self.env = NeonEnv(self)
         return self.env
 
@@ -470,8 +472,19 @@ class NeonEnvBuilder:
         assert self.env is not None, "environment is not already initialized, call init() first"
         self.env.start()
 
-    def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
-        env = self.init_configs()
+    def init_start(
+        self,
+        initial_tenant_conf: Optional[Dict[str, str]] = None,
+        default_remote_storage_if_missing: bool = True,
+    ) -> NeonEnv:
+        """
+        Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
+
+        To avoid creating initial_tenant, call init_configs to setup the environment.
+
+        Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one.
+        """
+        env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing)
         self.start()
 
         # Prepare the default branch to start the postgres on later.
@@ -546,7 +559,7 @@ class NeonEnvBuilder:
         user: RemoteStorageUser,
         bucket_name: Optional[str] = None,
         bucket_region: Optional[str] = None,
-    ) -> Optional[RemoteStorage]:
+    ) -> RemoteStorage:
         ret = kind.configure(
             self.repo_dir,
             self.mock_s3_server,
@@ -889,6 +902,8 @@ def _shared_simple_env(
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
      is set, this is shared by all tests using `neon_simple_env`.
+
+    This fixture will use RemoteStorageKind.LOCAL_FS with pageserver.
     """
 
     if os.environ.get("TEST_SHARED_FIXTURES") is None:
@@ -1481,6 +1496,16 @@ class NeonAttachmentService:
             self.running = False
         return self
 
+    def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
+        response = requests.post(
+            f"{self.env.control_plane_api}/attach_hook",
+            json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
+        )
+        response.raise_for_status()
+        gen = response.json()["gen"]
+        assert isinstance(gen, int)
+        return gen
+
     def __enter__(self) -> "NeonAttachmentService":
         return self
 
@@ -1689,12 +1714,7 @@ class NeonPageserver(PgProtocol):
         to call into the pageserver HTTP client.
         """
         if self.env.attachment_service is not None:
-            response = requests.post(
-                f"{self.env.control_plane_api}/attach_hook",
-                json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
-            )
-            response.raise_for_status()
-            generation = response.json()["gen"]
+            generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
         else:
             generation = None
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 9373073abf..9fdcd22bc2 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -620,3 +620,8 @@ class PageserverHttpClient(requests.Session):
             },
         )
         self.verbose_error(res)
+
+    def deletion_queue_flush(self, execute: bool = False):
+        self.put(
+            f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
+        ).raise_for_status()
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 2e5d75a0fc..70c2a06a07 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional
 
 
 def list_prefix(
-    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
+    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
 ) -> ListObjectsV2OutputTypeDef:
     """
     Note that this function takes into account prefix_in_bucket.
@@ -287,7 +287,7 @@ def list_prefix(
 
     # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
     response = remote.client.list_objects_v2(
-        Delimiter="/",
+        Delimiter=delimiter,
         Bucket=remote.bucket_name,
         Prefix=prefix,
     )
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index f7cddbc821..535f8c2fe7 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -202,9 +202,6 @@ class RemoteStorageKind(str, enum.Enum):
     LOCAL_FS = "local_fs"
     MOCK_S3 = "mock_s3"
     REAL_S3 = "real_s3"
-    # Pass to tests that are generic to remote storage
-    # to ensure the test pass with or without the remote storage
-    NOOP = "noop"
 
     def configure(
         self,
@@ -215,10 +212,7 @@ class RemoteStorageKind(str, enum.Enum):
         user: RemoteStorageUser,
         bucket_name: Optional[str] = None,
         bucket_region: Optional[str] = None,
-    ) -> Optional[RemoteStorage]:
-        if self == RemoteStorageKind.NOOP:
-            return None
-
+    ) -> RemoteStorage:
         if self == RemoteStorageKind.LOCAL_FS:
             return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user))
 
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index d0462844f0..c80f2d8360 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -4,7 +4,12 @@ from typing import List, Tuple
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    NeonEnvBuilder,
+    wait_for_last_flush_lsn,
+)
 from fixtures.types import TenantId, TimelineId
 
 
@@ -26,17 +31,18 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
 
     tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []
 
-    for _ in range(4):
+    for _ in range(3):
         tenant_id, timeline_id = env.neon_cli.create_tenant()
 
         endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
         with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key, value text)")
             cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
         endpoint.stop()
         tenant_timelines.append((tenant_id, timeline_id, endpoint))
 
-    # Stop the pageserver
+    # Stop the pageserver -- this has to be not immediate or we need to wait for uploads
     env.pageserver.stop()
 
     # Leave the first timeline alone, but corrupt the others in different ways
@@ -45,30 +51,21 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
 
     (tenant1, timeline1, pg1) = tenant_timelines[1]
     metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
-    f = open(metadata_path, "w")
-    f.write("overwritten with garbage!")
-    f.close()
+    with open(metadata_path, "w") as f:
+        f.write("overwritten with garbage!")
     log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
 
     (tenant2, timeline2, pg2) = tenant_timelines[2]
     timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
-    for filename in os.listdir(timeline_path):
-        if filename.startswith("00000"):
-            # Looks like a layer file. Remove it
-            os.remove(f"{timeline_path}/{filename}")
-    log.info(
-        f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
-    )
-
-    (tenant3, timeline3, pg3) = tenant_timelines[3]
-    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant3}/timelines/{timeline3}/"
     for filename in os.listdir(timeline_path):
         if filename.startswith("00000"):
             # Looks like a layer file. Corrupt it
-            f = open(f"{timeline_path}/{filename}", "w")
-            f.write("overwritten with garbage!")
-            f.close()
-    log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")
+            p = f"{timeline_path}/{filename}"
+            size = os.path.getsize(p)
+            with open(p, "wb") as f:
+                f.truncate(0)
+                f.truncate(size)
+    log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")
 
     env.pageserver.start()
 
@@ -87,22 +84,13 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
         f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
     )
 
-    # Second timeline has no ancestors, only the metadata file and no layer files locally,
-    # and we don't have the remote storage enabled. It is loaded into memory, but getting
-    # the basebackup from it will fail.
-    with pytest.raises(
-        Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
-    ) as err:
-        pg2.start()
-    log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
-
-    # Third timeline will also fail during basebackup, because the layer file is corrupt.
+    # Second timeline will fail during basebackup, because the local layer file is corrupt.
     # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
     # (We don't check layer file contents on startup, when loading the timeline)
     with pytest.raises(Exception, match="Failed to load delta layer") as err:
-        pg3.start()
+        pg2.start()
     log.info(
-        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
+        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
     )
 
 
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 740e489759..d4cf1b4739 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -211,4 +211,12 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
         ddl.wait()
 
     ddl.failures(False)
+    cur.execute("CREATE DATABASE failure WITH OWNER=cork")
+    ddl.wait()
+    with pytest.raises(psycopg2.InternalError):
+        ddl.failures(True)
+        cur.execute("DROP DATABASE failure")
+        ddl.wait()
+    ddl.pg.connect(dbname="failure")  # Ensure we can connect after a failed drop
+
     conn.close()
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index ae62fdf4a4..f3f3a1ddf3 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -74,11 +74,13 @@ class EvictionEnv:
     pgbench_init_lsns: Dict[TenantId, Lsn]
 
     def timelines_du(self) -> Tuple[int, int, int]:
-        return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])
+        return poor_mans_du(
+            self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
+        )
 
     def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
         return {
-            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
+            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
             for tid, tlid in self.timelines
         }
 
@@ -89,7 +91,21 @@ class EvictionEnv:
         """
         lsn = self.pgbench_init_lsns[tenant_id]
         with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint:
-            self.pg_bin.run(["pgbench", "-S", endpoint.connstr()])
+            # instead of using pgbench --select-only which does point selects,
+            # run full table scans for all tables
+            with endpoint.connect() as conn:
+                cur = conn.cursor()
+
+                tables_cols = {
+                    "pgbench_accounts": "abalance",
+                    "pgbench_tellers": "tbalance",
+                    "pgbench_branches": "bbalance",
+                    "pgbench_history": "delta",
+                }
+
+                for table, column in tables_cols.items():
+                    cur.execute(f"select avg({column}) from {table}")
+                    _avg = cur.fetchone()
 
     def pageserver_start_with_disk_usage_eviction(
         self, period, max_usage_pct, min_avail_bytes, mock_behavior
@@ -127,6 +143,19 @@ class EvictionEnv:
         self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
 
 
+def human_bytes(amt: float) -> str:
+    suffixes = ["", "Ki", "Mi", "Gi"]
+
+    last = suffixes[-1]
+
+    for name in suffixes:
+        if amt < 1024 or name == last:
+            return f"{int(round(amt))} {name}B"
+        amt = amt / 1024
+
+    raise RuntimeError("unreachable")
+
+
 @pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
     """
@@ -215,8 +244,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
 
     healthy_tenant_id, healthy_timeline_id = env.timelines[1]
 
-    broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
-    healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+    broken_size_pre, _, _ = poor_mans_du(
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+    )
+    healthy_size_pre, _, _ = poor_mans_du(
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+    )
 
     # try to evict everything, then validate that broken tenant wasn't touched
     target = broken_size_pre + healthy_size_pre
@@ -224,8 +257,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
     response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
     log.info(f"{response}")
 
-    broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
-    healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+    broken_size_post, _, _ = poor_mans_du(
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+    )
+    healthy_size_post, _, _ = poor_mans_du(
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+    )
 
     assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
     assert healthy_size_post < healthy_size_pre
@@ -366,18 +403,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
     du_by_timeline = env.du_by_timeline()
 
     # pick any tenant
-    [our_tenant, other_tenant] = list(du_by_timeline.keys())
-    (tenant_id, timeline_id) = our_tenant
+    [warm, cold] = list(du_by_timeline.keys())
+    (tenant_id, timeline_id) = warm
 
-    # make our tenant more recently used than the other one
+    # make picked tenant more recently used than the other one
     env.warm_up_tenant(tenant_id)
 
     # Build up enough pressure to require evictions from both tenants,
     # but not enough to fall into global LRU.
-    # So, set target to all occipied space, except 2*env.layer_size per tenant
-    target = (
-        du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
-    )
+    # So, set target to all occupied space, except 2*env.layer_size per tenant
+    target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
     response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
     log.info(f"{response}")
 
@@ -392,22 +427,33 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
             later_tenant_usage < du_by_timeline[tenant]
         ), "all tenants should have lost some layers"
 
+    warm_size = later_du_by_timeline[warm]
+
+    # bounds for warmed_size
+    warm_lower = 0.5 * du_by_timeline[warm]
+
+    # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+    # So, check for up to 3 here.
+    warm_upper = warm_lower + 3 * env.layer_size
+
+    cold_size = later_du_by_timeline[cold]
+    cold_upper = 2 * env.layer_size
+
+    log.info(
+        f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
+    )
+    log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+
+    assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
+    assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+
     assert (
-        later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
-    ), "our warmed up tenant should be at about half capacity, part 1"
-    assert (
-        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
-        # So, check for up to 3 here.
-        later_du_by_timeline[our_tenant]
-        < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
-    ), "our warmed up tenant should be at about half capacity, part 2"
-    assert (
-        later_du_by_timeline[other_tenant] < 2 * env.layer_size
-    ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"
+        cold_size < cold_upper
+    ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
 
 
 def poor_mans_du(
-    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
+    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
 ) -> Tuple[int, int, int]:
     """
     Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -430,9 +476,11 @@ def poor_mans_du(
                 smallest_layer = min(smallest_layer, size)
             else:
                 smallest_layer = size
-            log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
+            if verbose:
+                log.info(f"{tenant_id}/{timeline_id} => {file.name} {size} ({human_bytes(size)})")
 
-        log.info(f"{tenant_id}/{timeline_id}: sum {total}")
+        if verbose:
+            log.info(f"{tenant_id}/{timeline_id}: sum {total} ({human_bytes(total)})")
         total_on_disk += total
 
     assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
new file mode 100644
index 0000000000..81d38ac934
--- /dev/null
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -0,0 +1,352 @@
+"""
+
+Tests in this module exercise the pageserver's behavior around generation numbers,
+as defined in docs/rfcs/025-generation-numbers.md.  Briefly, the behaviors we require
+of the pageserver are:
+- Do not start a tenant without a generation number if control_plane_api is set
+- Remote objects must be suffixed with generation
+- Deletions may only be executed after validating generation
+- Updates to remote_consistent_lsn may only be made visible after validating generation
+"""
+
+
+import re
+import time
+from typing import Optional
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import list_prefix
+from fixtures.remote_storage import (
+    RemoteStorageKind,
+)
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import print_gc_result, wait_until
+
+# A tenant configuration that is convenient for generating uploads and deletions
+# without a large amount of postgres traffic.
+TENANT_CONF = {
+    # small checkpointing and compaction targets to ensure we generate many upload operations
+    "checkpoint_distance": f"{128 * 1024}",
+    "compaction_threshold": "1",
+    "compaction_target_size": f"{128 * 1024}",
+    # no PITR horizon, we specify the horizon when we request on-demand GC
+    "pitr_interval": "0s",
+    # disable background compaction and GC. We invoke it manually when we want it to happen.
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    # create image layers eagerly, so that GC can remove some layers
+    "image_creation_threshold": "1",
+}
+
+
+def generate_uploads_and_deletions(
+    env: NeonEnv,
+    *,
+    init: bool = True,
+    tenant_id: Optional[TenantId] = None,
+    timeline_id: Optional[TimelineId] = None,
+    data: Optional[str] = None,
+):
+    """
+    Using the environment's default tenant + timeline, generate a load pattern
+    that results in some uploads and some deletions to remote storage.
+    """
+
+    if tenant_id is None:
+        tenant_id = env.initial_tenant
+    assert tenant_id is not None
+
+    if timeline_id is None:
+        timeline_id = env.initial_timeline
+    assert timeline_id is not None
+
+    ps_http = env.pageserver.http_client()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        if init:
+            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+        def churn(data):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                INSERT INTO foo (id, val)
+                SELECT g, '{data}'
+                FROM generate_series(1, 20000) g
+                ON CONFLICT (id) DO UPDATE
+                SET val = EXCLUDED.val
+                """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
+            assert tenant_id is not None
+            assert timeline_id is not None
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+            ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        # Compaction should generate some GC-elegible layers
+        for i in range(0, 2):
+            churn(f"{i if data is None else data}")
+
+        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+
+def get_metric_or_0(ps_http, metric: str) -> int:
+    v = ps_http.get_metric_value(metric)
+    return 0 if v is None else int(v)
+
+
+def get_deletion_queue_executed(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total")
+
+
+def get_deletion_queue_submitted(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
+
+
+def get_deletion_queue_dropped(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
+
+
+def get_deletion_queue_unexpected_errors(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total")
+
+
+def get_deletion_queue_dropped_lsn_updates(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total")
+
+
+def get_deletion_queue_depth(ps_http) -> int:
+    """
+    Queue depth if at least one deletion has been submitted, else None
+    """
+    submitted = get_deletion_queue_submitted(ps_http)
+    executed = get_deletion_queue_executed(ps_http)
+    dropped = get_deletion_queue_dropped(ps_http)
+    depth = submitted - executed - dropped
+    log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})")
+
+    assert depth >= 0
+    return int(depth)
+
+
+def assert_deletion_queue(ps_http, size_fn) -> None:
+    v = get_deletion_queue_depth(ps_http)
+    assert v is not None
+    assert size_fn(v) is True
+
+
+def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
+    """
+    Validate behavior when a pageserver is run without generation support enabled,
+    then started again after activating it:
+    - Before upgrade, no objects should have generation suffixes
+    - After upgrade, the bucket should contain a mixture.
+    - In both cases, postgres I/O should work.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+
+    env = neon_env_builder.init_configs()
+    env.broker.try_start()
+    for sk in env.safekeepers:
+        sk.start()
+    assert env.attachment_service is not None
+    env.attachment_service.start()
+
+    env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
+
+    env.neon_cli.create_tenant(
+        tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
+    )
+    generate_uploads_and_deletions(env)
+
+    def parse_generation_suffix(key):
+        m = re.match(".+-([0-9a-zA-Z]{8})$", key)
+        if m is None:
+            return None
+        else:
+            log.info(f"match: {m}")
+            log.info(f"group: {m.group(1)}")
+            return int(m.group(1), 16)
+
+    pre_upgrade_keys = list(
+        [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
+    )
+    for key in pre_upgrade_keys:
+        assert parse_generation_suffix(key) is None
+
+    env.pageserver.stop()
+
+    # Starting without the override that disabled control_plane_api
+    env.pageserver.start()
+
+    generate_uploads_and_deletions(env, init=False)
+
+    legacy_objects: list[str] = []
+    suffixed_objects = []
+    post_upgrade_keys = list(
+        [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
+    )
+    for key in post_upgrade_keys:
+        log.info(f"post-upgrade key: {key}")
+        if parse_generation_suffix(key) is not None:
+            suffixed_objects.append(key)
+        else:
+            legacy_objects.append(key)
+
+    # Bucket now contains a mixture of suffixed and non-suffixed objects
+    assert len(suffixed_objects) > 0
+    assert len(legacy_objects) > 0
+
+    assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
+
+
+def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    some_other_pageserver = 1234
+    ps_http = env.pageserver.http_client()
+
+    generate_uploads_and_deletions(env)
+
+    # Flush: pending deletions should all complete
+    assert_deletion_queue(ps_http, lambda n: n > 0)
+    ps_http.deletion_queue_flush(execute=True)
+    assert_deletion_queue(ps_http, lambda n: n == 0)
+    assert get_deletion_queue_dropped(ps_http) == 0
+
+    # Our visible remote_consistent_lsn should match projected
+    timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+    assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+    env.pageserver.allowed_errors.extend(
+        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+    )
+
+    # Now advance the generation in the control plane: subsequent validations
+    # from the running pageserver will fail.  No more deletions should happen.
+    env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
+    generate_uploads_and_deletions(env, init=False)
+
+    assert_deletion_queue(ps_http, lambda n: n > 0)
+    queue_depth_before = get_deletion_queue_depth(ps_http)
+    executed_before = get_deletion_queue_executed(ps_http)
+    ps_http.deletion_queue_flush(execute=True)
+
+    # Queue drains to zero because we dropped deletions
+    assert_deletion_queue(ps_http, lambda n: n == 0)
+    # The executed counter has not incremented
+    assert get_deletion_queue_executed(ps_http) == executed_before
+    # The dropped counter has incremented to consume all of the deletions that were previously enqueued
+    assert get_deletion_queue_dropped(ps_http) == queue_depth_before
+
+    # Flush to S3 and see that remote_consistent_lsn does not advance: it cannot
+    # because generation validation fails.
+    timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+    assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"]
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0
+
+    # TODO: list bucket and confirm all objects have a generation suffix.
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+
+
+@pytest.mark.parametrize("keep_attachment", [True, False])
+def test_deletion_queue_recovery(
+    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
+):
+    """
+    :param keep_attachment: If true, we re-attach after restart.  Else, we act as if some other
+    node took the attachment while we were restarting.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    ps_http = env.pageserver.http_client()
+
+    # Prevent deletion lists from being executed, to build up some backlog of deletions
+    ps_http.configure_failpoints(
+        [
+            ("deletion-queue-before-execute", "return"),
+        ]
+    )
+
+    generate_uploads_and_deletions(env)
+
+    # There should be entries in the deletion queue
+    assert_deletion_queue(ps_http, lambda n: n > 0)
+    ps_http.deletion_queue_flush()
+    before_restart_depth = get_deletion_queue_depth(ps_http)
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
+    env.pageserver.stop(immediate=True)
+
+    if not keep_attachment:
+        some_other_pageserver = 101010
+        assert env.attachment_service is not None
+        env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
+
+    env.pageserver.start()
+
+    def assert_deletions_submitted(n: int):
+        assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
+
+    # After restart, issue a flush to kick the deletion frontend to do recovery.
+    # It should recover all the operations we submitted before the restart.
+    ps_http.deletion_queue_flush(execute=False)
+    wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
+
+    # The queue should drain through completely if we flush it
+    ps_http.deletion_queue_flush(execute=True)
+    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
+
+    if keep_attachment:
+        # If we kept the attachment, then our pre-restart deletions should have executed
+        # successfully
+        assert get_deletion_queue_executed(ps_http) == before_restart_depth
+    else:
+        # If we lost the attachment, we should have dropped our pre-restart deletions.
+        assert get_deletion_queue_dropped(ps_http) == before_restart_depth
+        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+    # Restart again
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()
+
+    # No deletion lists should be recovered: this demonstrates that deletion lists
+    # were cleaned up after being executed or dropped in the previous process lifetime.
+    time.sleep(1)
+    assert_deletion_queue(ps_http, lambda n: n == 0)
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index dae39d2752..b76dbbee03 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -5,7 +5,6 @@ from pathlib import Path
 from queue import SimpleQueue
 from typing import Any, Dict, Set
 
-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -17,15 +16,13 @@ from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
+# TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
+
 
-@pytest.mark.parametrize(
-    "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
-)
 def test_metric_collection(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
     httpserver_listen_address,
-    remote_storage_kind: RemoteStorageKind,
 ):
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -55,7 +52,7 @@ def test_metric_collection(
         synthetic_size_calculation_interval="3s"
         """
 
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
 
@@ -68,6 +65,14 @@ def test_metric_collection(
     env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     # httpserver is shut down before pageserver during passing run
     env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
+    # we have a fast rate of calculation, these can happen at shutdown
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+    )
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -98,17 +103,14 @@ def test_metric_collection(
             total += sample[2]
         return int(total)
 
-    remote_uploaded = 0
-
     # upload some data to remote storage
-    if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
-        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-        pageserver_http = env.pageserver.http_client()
-        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    pageserver_http = env.pageserver.http_client()
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
 
-        remote_uploaded = get_num_remote_ops("index", "upload")
-        assert remote_uploaded > 0
+    remote_uploaded = get_num_remote_ops("index", "upload")
+    assert remote_uploaded > 0
 
     # we expect uploads at 1Hz, on busy runners this could be too optimistic,
     # so give 5s we only want to get the following upload after "ready" value.
@@ -211,6 +213,14 @@ def test_metric_collection_cleans_up_tempfile(
 
     # httpserver is shut down before pageserver during passing run
     env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
+    # we have a fast rate of calculation, these can happen at shutdown
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+    )
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index c6ddb54ee6..3a56ca51a6 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -30,9 +30,7 @@ from fixtures.types import TenantId
 from fixtures.utils import run_pg_bench_small
 
 
-@pytest.mark.parametrize(
-    "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
-)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_tenant_delete_smoke(
     neon_env_builder: NeonEnvBuilder,
     remote_storage_kind: RemoteStorageKind,
@@ -43,6 +41,12 @@ def test_tenant_delete_smoke(
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(
+        [
+            # The deletion queue will complain when it encounters simulated S3 errors
+            ".*deletion executor: DeleteObjects request failed.*",
+        ]
+    )
 
     # lucky race with stopping from flushing a layer we fail to schedule any uploads
     env.pageserver.allowed_errors.append(
@@ -138,18 +142,12 @@ FAILPOINTS_BEFORE_BACKGROUND = [
 def combinations():
     result = []
 
-    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+    remotes = [RemoteStorageKind.MOCK_S3]
     if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
         remotes.append(RemoteStorageKind.REAL_S3)
 
     for remote_storage_kind in remotes:
         for delete_failpoint in FAILPOINTS:
-            if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
-                "timeline-delete-before-index-delete",
-            ):
-                # the above failpoint are not relevant for config without remote storage
-                continue
-
             # Simulate failures for only one type of remote storage
             # to avoid log pollution and make tests run faster
             if remote_storage_kind is RemoteStorageKind.MOCK_S3:
@@ -195,27 +193,32 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
         ]
     )
 
+    if simulate_failures:
+        env.pageserver.allowed_errors.extend(
+            [
+                # The deletion queue will complain when it encounters simulated S3 errors
+                ".*deletion executor: DeleteObjects request failed.*",
+            ]
+        )
+
     ps_http = env.pageserver.http_client()
 
     timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
     with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
         # generate enough layers
         run_pg_bench_small(pg_bin, endpoint.connstr())
-        if remote_storage_kind is RemoteStorageKind.NOOP:
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-        else:
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
 
-            if remote_storage_kind in available_s3_storages():
-                assert_prefix_not_empty(
-                    neon_env_builder,
-                    prefix="/".join(
-                        (
-                            "tenants",
-                            str(tenant_id),
-                        )
-                    ),
-                )
+        if remote_storage_kind in available_s3_storages():
+            assert_prefix_not_empty(
+                neon_env_builder,
+                prefix="/".join(
+                    (
+                        "tenants",
+                        str(tenant_id),
+                    )
+                ),
+            )
 
     ps_http.configure_failpoints((failpoint, "return"))
 
@@ -246,12 +249,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
         env.pageserver.stop()
         env.pageserver.start()
 
-        if (
-            remote_storage_kind is RemoteStorageKind.NOOP
-            and failpoint == "tenant-delete-before-create-local-mark"
-        ):
-            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
-        elif failpoint in (
+        if failpoint in (
             "tenant-delete-before-shutdown",
             "tenant-delete-before-create-remote-mark",
         ):
@@ -383,6 +381,7 @@ def test_tenant_delete_is_resumed_on_attach(
     assert not tenant_path.exists()
 
     if remote_storage_kind in available_s3_storages():
+        ps_http.deletion_queue_flush(execute=True)
         assert_prefix_empty(
             neon_env_builder,
             prefix="/".join(
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 8ccbcf551d..a20523b1f3 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -519,11 +519,8 @@ def test_detach_while_attaching(
 # * restart the pageserver and verify that ignored tenant is still not loaded
 # * `load` the same tenant
 # * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
-def test_ignored_tenant_reattach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 49a6ca5a53..7cea301a9c 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_until_tenant_active,
 )
-from fixtures.pg_version import PgVersion, xfail_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 
 
@@ -532,7 +532,24 @@ def test_single_branch_get_tenant_size_grows(
     assert size_after == prev, "size after restarting pageserver should not have changed"
 
 
-@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
+def assert_size_approx_equal(size_a, size_b):
+    """
+    Tests that evaluate sizes are checking the pageserver space consumption
+    that sits many layers below the user input.  The exact space needed
+    varies slightly depending on postgres behavior.
+
+    Rather than expecting postgres to be determinstic and occasionally
+    failing the test, we permit sizes for the same data to vary by a few pages.
+    """
+
+    # Determined empirically from examples of equality failures: they differ
+    # by page multiples of 8272, and usually by 1-3 pages.  Tolerate 4 to avoid
+    # failing on outliers from that observed range.
+    threshold = 4 * 8272
+
+    assert size_a == pytest.approx(size_b, abs=threshold)
+
+
 def test_get_tenant_size_with_multiple_branches(
     neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
@@ -573,7 +590,7 @@ def test_get_tenant_size_with_multiple_branches(
     )
 
     size_after_first_branch = http_client.tenant_size(tenant_id)
-    assert size_after_first_branch == size_at_branch
+    assert_size_approx_equal(size_after_first_branch, size_at_branch)
 
     first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)
 
@@ -599,7 +616,7 @@ def test_get_tenant_size_with_multiple_branches(
         "second-branch", main_branch_name, tenant_id
     )
     size_after_second_branch = http_client.tenant_size(tenant_id)
-    assert size_after_second_branch == size_after_continuing_on_main
+    assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
 
     second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)
 
@@ -635,7 +652,7 @@ def test_get_tenant_size_with_multiple_branches(
     # tenant_size but so far this has been reliable, even though at least gc
     # and tenant_size race for the same locks
     size_after = http_client.tenant_size(tenant_id)
-    assert size_after == size_after_thinning_branch
+    assert_size_approx_equal(size_after, size_after_thinning_branch)
 
     size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
     size_debug = http_client.tenant_size_debug(tenant_id)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 677c0d18e8..40dff194aa 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -12,7 +12,6 @@ from fixtures.log_helper import log
 from fixtures.metrics import (
     PAGESERVER_GLOBAL_METRICS,
     PAGESERVER_PER_TENANT_METRICS,
-    PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     parse_metrics,
 )
 from fixtures.neon_fixtures import (
@@ -232,17 +231,10 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
         assert value
 
 
-@pytest.mark.parametrize(
-    "remote_storage_kind",
-    # exercise both the code paths where remote_storage=None and remote_storage=Some(...)
-    [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
-)
-def test_pageserver_metrics_removed_after_detach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
+def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
     """Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
 
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
 
     neon_env_builder.num_safekeepers = 3
 
@@ -282,9 +274,6 @@ def test_pageserver_metrics_removed_after_detach(
     for tenant in [tenant_1, tenant_2]:
         pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
         expected = set(PAGESERVER_PER_TENANT_METRICS)
-        if remote_storage_kind == RemoteStorageKind.NOOP:
-            # if there's no remote storage configured, we don't expose the remote timeline client metrics
-            expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
         assert pre_detach_samples == expected
 
         env.pageserver.http_client().tenant_detach(tenant)
@@ -294,9 +283,7 @@ def test_pageserver_metrics_removed_after_detach(
 
 
 # Check that empty tenants work with or without the remote storage
-@pytest.mark.parametrize(
-    "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP]
-)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_pageserver_with_empty_tenants(
     neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
 ):
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 0e4df21d83..3af144c31c 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
     last_flush_lsn_upload,
-    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
@@ -145,19 +144,12 @@ DELETE_FAILPOINTS = [
 def combinations():
     result = []
 
-    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+    remotes = [RemoteStorageKind.MOCK_S3]
     if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
         remotes.append(RemoteStorageKind.REAL_S3)
 
     for remote_storage_kind in remotes:
         for delete_failpoint in DELETE_FAILPOINTS:
-            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
-                "timeline-delete-before-index-delete",
-                "timeline-delete-after-index-delete",
-            ):
-                # the above failpoints are not relevant for config without remote storage
-                continue
-
             result.append((remote_storage_kind, delete_failpoint))
     return result
 
@@ -205,23 +197,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
     with env.endpoints.create_start("delete") as endpoint:
         # generate enough layers
         run_pg_bench_small(pg_bin, endpoint.connstr())
-        if remote_storage_kind is RemoteStorageKind.NOOP:
-            wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
-        else:
-            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
 
-            if remote_storage_kind in available_s3_storages():
-                assert_prefix_not_empty(
-                    neon_env_builder,
-                    prefix="/".join(
-                        (
-                            "tenants",
-                            str(env.initial_tenant),
-                            "timelines",
-                            str(timeline_id),
-                        )
-                    ),
-                )
+        last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
+
+        if remote_storage_kind in available_s3_storages():
+            assert_prefix_not_empty(
+                neon_env_builder,
+                prefix="/".join(
+                    (
+                        "tenants",
+                        str(env.initial_tenant),
+                        "timelines",
+                        str(timeline_id),
+                    )
+                ),
+            )
 
     env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
     # It appears when we stopped flush loop during deletion and then pageserver is stopped
@@ -807,6 +797,8 @@ def test_delete_orphaned_objects(
     reason = timeline_info["state"]["Broken"]["reason"]
     assert reason.endswith(f"failpoint: {failpoint}"), reason
 
+    ps_http.deletion_queue_flush(execute=True)
+
     for orphan in orphans:
         assert not orphan.exists()
         assert env.pageserver.log_contains(
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index f856b26c6e..c2e93c48c7 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -301,12 +301,8 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_init(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
 
@@ -337,17 +333,12 @@ def test_timeline_physical_size_init(
     )
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-        remote_storage_kind,
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
     )
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_checkpoint(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
 
@@ -369,19 +360,14 @@ def test_timeline_physical_size_post_checkpoint(
 
     def check():
         assert_physical_size_invariants(
-            get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-            remote_storage_kind,
+            get_physical_size_values(env, env.initial_tenant, new_timeline_id),
         )
 
     wait_until(10, 1, check)
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_compaction(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     # Disable background compaction as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
@@ -420,21 +406,15 @@ def test_timeline_physical_size_post_compaction(
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
 
-    if remote_storage_kind is not None:
-        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-        remote_storage_kind,
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
     )
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_gc(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
@@ -471,12 +451,10 @@ def test_timeline_physical_size_post_gc(
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
 
-    if remote_storage_kind is not None:
-        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-        remote_storage_kind,
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
     )
 
 
@@ -560,14 +538,10 @@ def test_timeline_size_metrics(
     assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_tenant_physical_size(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
+def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
     random.seed(100)
 
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
 
@@ -575,12 +549,10 @@ def test_tenant_physical_size(
     client = env.pageserver.http_client()
 
     tenant, timeline = env.neon_cli.create_tenant()
-    if remote_storage_kind is not None:
-        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
 
     def get_timeline_resident_physical_size(timeline: TimelineId):
-        sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
-        assert_physical_size_invariants(sizes, remote_storage_kind)
+        sizes = get_physical_size_values(env, tenant, timeline)
+        assert_physical_size_invariants(sizes)
         return sizes.prometheus_resident_physical
 
     timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
@@ -600,8 +572,7 @@ def test_tenant_physical_size(
         wait_for_last_flush_lsn(env, endpoint, tenant, timeline)
         pageserver_http.timeline_checkpoint(tenant, timeline)
 
-        if remote_storage_kind is not None:
-            wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
+        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
 
         timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
 
@@ -630,7 +601,6 @@ def get_physical_size_values(
     env: NeonEnv,
     tenant_id: TenantId,
     timeline_id: TimelineId,
-    remote_storage_kind: Optional[RemoteStorageKind],
 ) -> TimelinePhysicalSizeValues:
     res = TimelinePhysicalSizeValues()
 
@@ -646,12 +616,9 @@ def get_physical_size_values(
     res.prometheus_resident_physical = metrics.query_one(
         "pageserver_resident_physical_size", metrics_filter
     ).value
-    if remote_storage_kind is not None:
-        res.prometheus_remote_physical = metrics.query_one(
-            "pageserver_remote_physical_size", metrics_filter
-        ).value
-    else:
-        res.prometheus_remote_physical = None
+    res.prometheus_remote_physical = metrics.query_one(
+        "pageserver_remote_physical_size", metrics_filter
+    ).value
 
     detail = client.timeline_detail(
         tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
@@ -664,20 +631,15 @@ def get_physical_size_values(
     return res
 
 
-def assert_physical_size_invariants(
-    sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
-):
+def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # resident phyiscal size is defined as
     assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
     assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum
 
     # we don't do layer eviction, so, all layers are resident
     assert sizes.api_current_physical == sizes.prometheus_resident_physical
-    if remote_storage_kind is not None:
-        assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
-        # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
-    else:
-        assert sizes.prometheus_remote_physical is None
+    assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
+    # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,