docs: pageserver controller rfc

docs: sharding phase 1 RFC
clarifications
2026-05-17 13:10:38 +00:00 · 2023-09-29 18:24:24 +01:00 · 2023-09-29 18:20:13 +01:00 · 2023-09-27 10:38:52 +01:00 · 2023-09-27 10:38:52 +01:00 · 2023-09-27 10:38:41 +01:00
77 changed files with 1591 additions and 4824 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -834,7 +834,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.12
+      VM_BUILDER_VERSION: v0.17.11

    steps:
      - name: Checkout
--- a/2
+++ b/2
@@ -5,7 +5,7 @@
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/storage
+/pageserver/ @neondatabase/compute @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -158,17 +158,6 @@ dependencies = [
 "syn 1.0.109",
 ]

-[[package]]
-name = "async-channel"
-version = "1.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
-dependencies = [
- "concurrent-queue",
- "event-listener",
- "futures-core",
-]
-
 [[package]]
 name = "async-compression"
 version = "0.4.0"
@@ -1026,15 +1015,6 @@ dependencies = [
 "zstd",
 ]

-[[package]]
-name = "concurrent-queue"
-version = "2.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400"
-dependencies = [
- "crossbeam-utils",
-]
-
 [[package]]
 name = "const_format"
 version = "0.2.30"
@@ -1455,12 +1435,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "event-listener"
-version = "2.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
-
 [[package]]
 name = "fail"
 version = "0.5.1"
@@ -1806,9 +1780,18 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"

 [[package]]
 name = "hermit-abi"
-version = "0.3.3"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
+checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"

 [[package]]
 name = "hex"
@@ -2070,7 +2053,7 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.3.1",
 "libc",
 "windows-sys 0.48.0",
 ]
@@ -2087,7 +2070,7 @@ version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.3.1",
 "io-lifetimes",
 "rustix 0.37.19",
 "windows-sys 0.48.0",
@@ -2461,11 +2444,11 @@ dependencies = [

 [[package]]
 name = "num_cpus"
-version = "1.16.0"
+version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
+checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.2.6",
 "libc",
 ]

@@ -2682,7 +2665,6 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-channel",
 "async-compression",
 "async-stream",
 "async-trait",
@@ -3264,7 +3246,6 @@ dependencies = [
 "reqwest-tracing",
 "routerify",
 "rstest",
- "rustc-hash",
 "rustls",
 "rustls-pemfile",
 "scopeguard",
@@ -3436,7 +3417,6 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
- "rand",
 "scopeguard",
 "serde",
 "serde_json",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,7 +107,6 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
-rustc-hash = "1.1.0"
 rustls = "0.21"
 rustls-pemfile = "1"
 rustls-split = "0.3"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -614,11 +614,15 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre

 #########################################################################################
 #
-# Layer "rust extensions" for older extension which hasn't been updated to `pgrx` yet
+# Layer "rust extensions"
 # This layer is used to build `pgx` deps
 #
+# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
+# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
+# dependency on all the rust extensions that depend on it, too.
+#
 #########################################################################################
-FROM build-deps AS rust-extensions-build-pgx
+FROM build-deps AS rust-extensions-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN apt-get update && \
@@ -650,34 +654,6 @@ RUN case "${PG_VERSION}" in \

 USER root

-#########################################################################################
-#
-# Layer "rust extensions"
-# This layer is used to build `pgrx` deps
-#
-#########################################################################################
-FROM build-deps AS rust-extensions-build-pgrx
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-RUN apt-get update && \
-    apt-get install -y curl libclang-dev cmake && \
-    useradd -ms /bin/bash nonroot -b /home
-
-ENV HOME=/home/nonroot
-ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
-USER nonroot
-WORKDIR /home/nonroot
-ARG PG_VERSION
-
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
-    chmod +x rustup-init && \
-    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
-    rm rustup-init && \
-    cargo install --locked --version 0.10.2 cargo-pgrx && \
-    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
-
-USER root
-
 #########################################################################################
 #
 # Layer "pg-jsonschema-pg-build"
@@ -685,7 +661,7 @@ USER root
 #
 #########################################################################################

-FROM rust-extensions-build-pgx AS pg-jsonschema-pg-build
+FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION

 # caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
@@ -714,7 +690,7 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################

-FROM rust-extensions-build-pgx AS pg-graphql-pg-build
+FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION

 # b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
@@ -748,14 +724,24 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################

-FROM rust-extensions-build-pgrx AS pg-tiktoken-pg-build
+FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION

-# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
-RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
-    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
+# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
-    cargo pgrx install --release && \
+    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

 #########################################################################################
@@ -765,18 +751,24 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
 #
 #########################################################################################

-FROM rust-extensions-build-pgrx AS pg-pgx-ulid-build
+FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION

-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
-    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
+    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
-    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
-    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
-    echo "********************************************************************************************************" && \
-    sed -i 's/pgrx       = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgrx install --release && \
+    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

 #########################################################################################
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -223,7 +223,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    if attach_req.pageserver_id.is_some() {
        tenant_state.generation += 1;
    }
-    tenant_state.pageserver = attach_req.pageserver_id;
    let generation = tenant_state.generation;

    locked.save().await.map_err(ApiError::InternalServerError)?;
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = []
+ignore = ["RUSTSEC-2023-0052"]

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/docs/rfcs/029-sharding-phase1.md
+++ b/docs/rfcs/029-sharding-phase1.md
@@ -0,0 +1,244 @@
+# Sharding Phase 1: Static Key-space Sharding
+
+## Summary
+
+To enable databases with sizes approaching the capacity of a pageserver's disk,
+it is necessary to break up the storage for the database, or _shard_ it.
+
+Sharding in general is a complex area.  This RFC aims to define a modest initial
+capability that will permit creating large-capacity databases using a static configuration
+defined at time of Tenant creation.
+
+## Motivation
+
+Currently, all data for a Tenant, including all its timelines, is stored on a single
+pageserver.  The local storage required may be several times larger than the actual
+database size, due to LSM write inflation.
+
+If a database is larger than what one pageserver can hold, then it becomes impossible
+for the pageserver to hold it in local storage, as it must do to provide service to
+clients.
+
+### Prior art
+
+Numerous: sharding is a long-discussed feature for the pageserver.
+
+Prior art in other distributed systems is too broad to capture here: pretty much
+any scale out storage system does something like this.
+
+## Requirements
+
+- Enable creating a large (for example, 16TiB) database without requiring dedicated
+  pageserver nodes.
+- Share read/write bandwidth costs for large databases across pageservers, as well
+  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
+  that disrupt service to other tenants.
+- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
+  does not write out a single contiguous range of page numbers.
+
+*Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
+that a user might create on a current-gen enterprise SSD should also work well on
+Neon.  The upper bound is whatever postgres can handle: i.e. we must make sure that the
+pageserver backend is not the limiting factor in the database size*.
+
+## Non Goals
+
+- Independently distributing timelines within the same tenant.  If a tenant has many
+  timelines, then sharding may be a less efficient mechanism for distributing load than
+  sharing out timelines between pageservers.
+- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
+  based on the idea that separate mechanisms will make sense for each dimension.
+
+## Impacted Components
+
+pageserver, control plane, safekeeper (optional)
+
+## Terminology
+
+**Key**: a postgres page number.  In the sense that the pageserver is a versioned key-value store,
+the page number is the key in that store.
+
+**LSN dimension**: this just means the range of LSNs (history), when talking about the range
+of keys and LSNs as a two dimensional space.
+
+## Implementation
+
+### Key sharding vs. LSN sharding
+
+When we think of sharding across the two dimensional key/lsn space, this is an
+opportunity to think about how the two dimensions differ:
+- Sharding the key space distributes the _write_ workload of ingesting data
+  and compacting.  This work must be carefully managed so that exactly one
+  node owns a given key.
+- Sharding the LSN space distributes the _historical read_ workload.  This work
+  can be done by anyone without any special coordination, as long as they can
+  see the remote index and layers.
+
+The key sharding is the harder part, and also the more urgent one, to support larger
+capacity databases.  Because distributing historical LSN read work is a relatively
+simpler problem that most users don't have, we defer it to future work.  It is anticipated
+that some quite simple P2P offload model will enable distributing work for historical
+reads: a node which is low on space can call out to a peer to ask it to download and
+serve reads from a historical layer.
+
+### Key mapping scheme
+
+Having decided to focus on key sharding, we must next decide how we will map
+keys to shards.
+
+It is proposed to use a "wide striping" approach, to obtain a good compromise
+between data locality and avoiding entire large relations mapping to the same shard.
+
+The mapping is quite simple:
+- Define a stripe size, such as 256MiB.  Map this to a key count, such that a contiguous
+  256MiB range of keys would all fall into this stripe, i.e. divide by 8KiB to get 32k keys.
+- Map a key to a stripe by integer division.
+- Map a stripe to a shard by taking the stripe index modulo the shard count.
+
+This scheme will achieve a good balance as long as there is no aliasing of the keys
+to the stripe width.  In the example above, if someone had 4 shards and wrote
+keys that were all 4*32k apart, they would all map to the same shard.  However, we do
+not have to worry about this, since end users do not control page numbers: as long as
+we do not pick stripe sizes that map to any problematic postgres behaviors, we'll be fine.
+
+### Important Types
+
+#### `ShardMap`
+
+Provides all the information needed to route a request for a particular
+key to the correct pageserver:
+- Stripe size
+- Shard count
+- Address of the pageserver hosting each shard
+
+This structure's size is linear with the number of shards.
+
+#### `ShardIdentity`
+
+Provides the information needed to know whether a particular key belongs
+to a particular shard:
+- Stripe size
+- Shard count
+- Shard index
+
+This structure's size is constant.
+
+### Pageserver changes
+
+Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
+TenantShards, which are just a `Tenant` plus a `ShardIdentity` telling it which part
+of the keyspace it owns.
+
+When the pageserver subscribes to a safekeeper for WAL updates, it must provide
+its `ShardIdentity` to receive the relevant subset of the WAL.
+
+When the pageserver writes layers and index_part.json to remote storage, it must
+include the shard index & count in the name, to avoid collisions (the count is
+necessary for future-proofing: the count will vary in time).  These keys
+will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
+exactly the same for TenantShards as it does for Tenants today: each shard will have
+its own generation number.
+
+The pageserver doesn't have to do anything special during ingestion, compaction
+or GC.  It is implicitly operating on the subset of keys that map to its ShardIdentity.
+This will result in sparse layer files, containing keys only in the stripes that this
+shard owns.  Where optimizations currently exist in compaction for spotting "gaps" in
+the key range, these should be updated to ignore gaps that are due to sharding, to
+avoid spuriously splitting up layers into stripe-sized pieces.
+
+### Pageserver Controller changes
+
+The pageserver controller is a new component, which is responsible for abstracting
+away the business of managing individual tenant placement on pageservers.  It will
+also act as the abstraction on top of sharding, so that the control plane continues
+to see a Tenant as a single object, even though the reality is that it is many
+TenantShards.
+
+For the rest of this RFC, think of the Pageserver Controller as a component of
+the control plane.  The actual implementation is beyond the scope of this RFC
+and will be described in more detail elsewhere.
+
+### Safekeeper changes
+
+The safekeeper's API for subscribing to a WAL will be extended to enable callers
+to provide a `ShardIdentity`.  In this mode it will only send WAL entries that
+fall within the keyspace belonging to the shard, and WAL entries that are to
+be mirrored to all shards.
+
+Metadata updates describing databases+relations are mirrored to
+all shards, and other WAL messages are only provided to the shard
+that owns the key being updated.  For any operation that updates multiple
+keys, it will be provided to all the shards whose key ranges intersect with
+one or more of the keys referenced in the WAL message.
+
+### Pageserver Controller
+
+### Endpoints
+
+Compute endpoints will need to:
+- Accept a ShardMap as part of their configuration from the control plane
+- Route pageserver requests according to that ShardMap
+
+### Control Plane
+
+#### Publishing ShardMap updates
+
+The control plane will provide an API for the pageserver controller to publish updates
+to the ShardMap for a tenant.  When such an update is provided, it will be used to
+update the configuration of any endpoints currently active for the tenant.
+
+The ShardMap will be opaque to the Control Plane: it doesn't need to do anything with it
+other than storing and passing on to endpoints.
+
+#### Attaching via the Pageserver Controller
+
+The Control Plane will issue attach/create API calls to the pageserver controller
+instead of directly to pageservers.  This will relieve the control plane of the need
+to know about sharding.
+
+#### Enabling sharding for large tenants
+
+When a Tenant is created, it is up to the control plane to provide a hint to
+the pageserver about how large it will be.  This may be implemented as a service tier,
+where users creating very large databases would be onboarded to the tier, and then
+the Tenants they create would be created with a larger number of shards.  For the
+general population of users we should continue to use 1 shard by default.
+
+## Next Steps
+
+Clearly, the mechanism described in this RFC has substantial limitations:
+- A) the number of shards in a tenant is defined at creation time.
+- B) data is not distributed across the LSN dimension
+
+To address `A`, a _splitting_ feature will later be added.  One shard can split its
+data into a number of children by doing a special compaction operation to generate
+image layers broken up child-shard-wise, and then writing out an index_part.json for
+each child.  This will then require coordination with the pageserver controller to
+safely attach these new child shards and then move them around to distribute work.
+The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
+once a Tenant has been sharded, there is little value in merging it again.
+
+To address `B`, it is envisaged to have some gossip mechanism for pageservers to communicate
+about their workload, and then a getpageatlsn offload mechanism where one pageserver can
+ask another to go read the necessary layers from remote storage to serve the read.  This
+requires relatively little coordination because it is read-only: any node can service any
+read.  All reads to a particular shard would still flow through one node, but the
+disk capacity & I/O impact of servicing the read would be distributed.
+
+## FAQ/Alternatives
+
+### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
+
+When a database is growing under a write workload, writes may predominantly hit the
+end of the keyspace, creating a bandwidth hotspot on that shard.  Similarly, if the user
+is intensively re-writing a particular relation, if that relation lived in a particular
+shard then it would not achieve our goal of distributing the write work across shards.
+
+### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
+
+Two reasons:
+1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
+   database would still cause a load hotspot on the pageserver routing its read requests. 
+2. Implementing a proxy model as a stop-gap would not be a cheap option, because
+   it requires making pageservers aware of their peers, and adding synchronisation to
+   keep pageservers aware of their peers as they come and go.
--- a/docs/rfcs/030-pageserver-controller-phase1.md
+++ b/docs/rfcs/030-pageserver-controller-phase1.md
@@ -0,0 +1,119 @@
+# Pageserver Controller Phase 1: Generations
+
+## Summary
+
+In the [generation numbers RFC](025-generation-numbers.md), it was proposed that
+the console/control plane would act as the central coordinator for issuing generation
+numbers.
+
+That approach has not proven practical, so this RFC proposes an alternative implementation
+where generation numbers are managed in a different service.
+
+Calls to generation-aware pageserver APIs like create/attach will call out to this
+new _pageserver controller_ to acquire generation numbers.  This service will also
+form the basis for satisfying future pageserver management requirements, such as
+coordinating sharding, doing automatic capacity balancing, and many more.
+
+## Motivation
+
+This is a dependency for delivering high availability.
+
+### Prior art
+
+None
+
+## Requirements
+
+- Provide a hook for the pageserver to use when it receives an attach/create/load API
+  call, which will yield a generation that is safe for the pageserver to use.
+- Implement the /re-attach and /validate APIs required for the generation numbers feature
+  to work. 
+
+## Non Goals
+
+- This is not intended to interact with any components other than the pageserver, or
+  to integrate with the broader control plane in any way.
+
+## Impacted Components
+
+pageserver, pageserver controller (new)
+
+## Implementation
+
+We may start from the minimal `attachment_service` used in automated tests.
+
+### Data store
+
+For generation numbers, we need a persistent, linearizable data store.  Postgres is sufficient for
+this: we already have postgres instances used for other control plane work.
+
+The storage for the Pageserver Controller will be independent of other components:
+it might use the same physical database server but would use an independent database.
+
+### Deployment
+
+There will be one instance per region.  In future we would aim to define the concept
+of a pageserver cluster and have one controller per cluster, but in the short term
+one per region will be functionally okay for current scale.
+
+The pageserver controller will be deployed within kubernetes, in the same way as
+the storage broker (which is currently via a [helm chart](https://github.com/neondatabase/helm-charts/tree/main/charts/neon-storage-broker)).
+
+### Security
+
+The pageserver controller's API will do authentication with JWT, the same as
+the pageserver's existing API.
+
+### Correctness
+
+It is essential that pageservers call into the controller at the _very start_ of
+handling attach/create/load API requests.  They should not do any work at all until
+they have acquired that generation number.
+
+If the call fails, they must retry: it is not safe to proceed without a generation number.
+
+## Future
+
+Having a call chain that goes `Control plane -> Pageserver -> Pageserver controller`
+is clearly a little strange: we are only doing this to avoid needing to make changes
+to the control plane.
+
+In future, we will change the control plane to call directly into the pageserver
+controller, which would then call onwards into the pageserver.  This would be a fairly
+small change to the controller, since all the logic around storing and updating
+generation numbers would stay the same: just the behavior of the API frontend
+would be different.
+
+The work to enable pageservers to communicate with the controller is not wasted,
+because they still communicate in that direction when invoking `/re-attach`
+and `/validate`.
+
+## Alternatives considered
+
+### Run in the console/control plane codebase
+
+The control plane is a large Go codebase that uses extensive code generation, and
+has to be quite generic to manage many different types of component.
+
+### Direct DB access
+
+We could have pageservers call directly into a shared database to acquire and update
+generation numbers (with carefully crafted transactions to protect against concurrent
+attaches getting the same generation, etc).   
+
+Pros:
+- No extra service required, simpler deployment
+
+Cons:
+- No future path to a cleaner architecture: the pageserver controller can be implemented
+  as an extensible place to implement more functionality in future, whereas a mechanism
+  to do generation numbers via SQL queries from the pageserver would be specialized
+  and the code would probably be disposed of in the relatively near future.
+- Puts onus entirely on SQL query correctness to mediate concurrent access.
+  The pageserver controller also has to be correct in this respect in case there
+  is more than one instance running, but it is much less likely to hit this path,
+  so the overall risk of issues is lower when using a central service.
+
+
+The main downside to that approach is that it doesn't provide the future path that
+the pageserver controller does.
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;

 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(serde::Serialize)]
 pub struct EventChunk<'a, T: Clone> {
    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -363,15 +363,8 @@ pub struct TimelineInfo {
    pub latest_gc_cutoff_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
-
-    /// The LSN that we have succesfully uploaded to remote storage
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
-
-    /// The LSN that we are advertizing to safekeepers
-    #[serde_as(as = "DisplayFromStr")]
-    pub remote_consistent_lsn_visible: Lsn,
-
    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -29,4 +29,3 @@ workspace_hack.workspace = true
 [dev-dependencies]
 tempfile.workspace = true
 test-context.workspace = true
-rand.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -20,7 +20,6 @@ use std::{

 use anyhow::{bail, Context};

-use serde::{Deserialize, Serialize};
 use tokio::io;
 use toml_edit::Item;
 use tracing::info;
@@ -43,9 +42,6 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

-/// As defined in S3 docs
-pub const MAX_KEYS_PER_DELETE: usize = 1000;
-
 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';

 /// Path on the remote storage, relative to some inner prefix.
@@ -54,25 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);

-impl Serialize for RemotePath {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        serializer.collect_str(self)
-    }
-}
-
-impl<'de> Deserialize<'de> for RemotePath {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let str = String::deserialize(deserializer)?;
-        Ok(Self(PathBuf::from(&str)))
-    }
-}
-
 impl std::fmt::Display for RemotePath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0.display())
@@ -111,10 +88,6 @@ impl RemotePath {
    pub fn extension(&self) -> Option<&str> {
        self.0.extension()?.to_str()
    }
-
-    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
-        self.0.strip_prefix(&p.0)
-    }
 }

 /// Storage (potentially remote) API to manage its state.
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -33,10 +33,11 @@ use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

+const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
+
 pub(super) mod metrics;

 use self::metrics::{AttemptOutcome, RequestKind};
@@ -499,7 +500,7 @@ impl RemoteStorage for S3Bucket {
            delete_objects.push(obj_id);
        }

-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
            let started_at = start_measuring_requests(kind);

            let resp = self
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -378,30 +378,21 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
 fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
-    use rand::Rng;
-
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
-
-    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
-    // millis is just a debugging aid for easier finding the prefix later.
-    let millis = std::time::SystemTime::now()
+    let random_prefix_part = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
-        .as_millis();
-
-    // because nanos can be the same for two threads so can millis, add randomness
-    let random = rand::thread_rng().gen::<u32>();
-
+        .as_nanos();
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
-            prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
+            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -89,22 +89,6 @@ impl Generation {
            Self::Broken => panic!("Attempted to use a broken generation"),
        }
    }
-
-    pub fn next(&self) -> Generation {
-        match self {
-            Self::Valid(n) => Self::Valid(*n + 1),
-            Self::None => Self::Valid(1),
-            Self::Broken => panic!("Attempted to use a broken generation"),
-        }
-    }
-
-    pub fn into(self) -> Option<u32> {
-        if let Self::Valid(v) = self {
-            Some(v)
-        } else {
-            None
-        }
-    }
 }

 impl Serialize for Generation {
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -24,9 +24,6 @@ pub enum ApiError {
    #[error("Precondition failed: {0}")]
    PreconditionFailed(Box<str>),

-    #[error("Shutting down")]
-    ShuttingDown,
-
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -55,10 +52,6 @@ impl ApiError {
                self.to_string(),
                StatusCode::PRECONDITION_FAILED,
            ),
-            ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
-                "Shutting down".to_string(),
-                StatusCode::SERVICE_UNAVAILABLE,
-            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -216,24 +216,6 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
    }
 }

-/// When you will store a secret but want to make sure it won't
-/// be accidentally logged, wrap it in a SecretString, whose Debug
-/// implementation does not expose the contents.
-#[derive(Clone, Eq, PartialEq)]
-pub struct SecretString(String);
-
-impl SecretString {
-    pub fn get_contents(&self) -> &str {
-        self.0.as_str()
-    }
-}
-
-impl std::fmt::Debug for SecretString {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "[SECRET]")
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use metrics::{core::Opts, IntCounterVec};
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -431,14 +431,14 @@ impl CgroupWatcher {
                            .context("failed to request upscale")?;

                        let memory_high =
-                            self.get_memory_high_bytes().context("failed to get memory.high")?;
+                            self.get_high_bytes().context("failed to get memory.high")?;
                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
                        info!(
                            current_high_bytes = memory_high,
                            new_high_bytes = new_high,
                            "updating memory.high"
                        );
-                        self.set_memory_high_bytes(new_high)
+                        self.set_high_bytes(new_high)
                            .context("failed to set memory.high")?;
                        last_memory_high_increase_at = Some(Instant::now());
                        continue;
@@ -556,6 +556,14 @@ impl CgroupWatcher {
    }
 }

+/// Represents a set of limits we apply to a cgroup to control memory usage.
+///
+/// Setting these values also affects the thresholds for receiving usage alerts.
+#[derive(Debug)]
+pub struct MemoryLimits {
+    pub high: u64,
+}
+
 // Methods for manipulating the actual cgroup
 impl CgroupWatcher {
    /// Get a handle on the freezer subsystem.
@@ -616,29 +624,50 @@ impl CgroupWatcher {
    }

    /// Set cgroup memory.high threshold.
-    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
-        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
-    }
-
-    /// Set the cgroup's memory.high to 'max', disabling it.
-    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
-        self.set_memory_high_internal(MaxValue::Max)
-    }
-
-    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
+    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
        self.memory()
            .context("failed to get memory subsystem")?
            .set_mem(cgroups_rs::memory::SetMemory {
                low: None,
-                high: Some(value),
+                high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
                min: None,
                max: None,
            })
-            .map_err(anyhow::Error::from)
+            .context("failed to set memory.high")
+    }
+
+    /// Set cgroup memory.high from `limits`; memory.max is not written here (`max: None`).
+    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
+        info!(limits.high, path = self.path(), "writing new memory limits",);
+        self.memory()
+            .context("failed to get memory subsystem while setting memory limits")?
+            .set_mem(cgroups_rs::memory::SetMemory {
+                min: None,
+                low: None,
+                high: Some(MaxValue::Value(
+                    u64::min(limits.high, i64::MAX as u64) as i64
+                )),
+                max: None,
+            })
+            .context("failed to set memory limits")
+    }
+
+    /// Given some amount of available memory, set the desired cgroup memory limits
+    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
+        let new_high = self.config.calculate_memory_high_value(available_memory);
+        let limits = MemoryLimits { high: new_high };
+        info!(
+            path = self.path(),
+            memory = ?limits,
+            "setting cgroup memory",
+        );
+        self.set_limits(&limits)
+            .context("failed to set cgroup memory limits")?;
+        Ok(())
    }

    /// Get memory.high threshold.
-    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
+    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
        let high = self
            .memory()
            .context("failed to get memory subsystem while getting memory statistics")?
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -16,7 +16,7 @@ use tokio::sync::mpsc;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};

-use crate::cgroup::{CgroupWatcher, Sequenced};
+use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -106,51 +106,6 @@ impl Runner {
            kill,
        };

-        // If we have both the cgroup and file cache integrations enabled, it's possible for
-        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
-        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
-        // we *do* still want to determine the file cache size before setting the cgroup's
-        // memory.high, so it's not as simple as just swapping the order.
-        //
-        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
-        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
-        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
-        // of a hacky solution, but helps with reliability.
-        if let Some(name) = &args.cgroup {
-            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
-            // now, and then set limits later.
-            info!("initializing cgroup");
-
-            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
-                .context("failed to create cgroup manager")?;
-
-            info!("temporarily unsetting memory.high");
-
-            // Temporarily un-set cgroup memory.high; see above.
-            cgroup
-                .unset_memory_high()
-                .context("failed to unset memory.high")?;
-
-            let cgroup = Arc::new(cgroup);
-
-            let cgroup_clone = Arc::clone(&cgroup);
-            spawn_with_cancel(
-                token.clone(),
-                |_| error!("cgroup watcher terminated"),
-                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
-            );
-
-            state.cgroup = Some(cgroup);
-        } else {
-            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
-            // This allows us to poll it in `Monitor::run` regardless of whether we
-            // are managing a cgroup or not. If we don't forget it, all receives will
-            // immediately return an error because the sender is droped and it will
-            // claim all select! statements, effectively turning `Monitor::run` into
-            // `loop { fail to receive }`.
-            mem::forget(requesting_send);
-        }
-
        let mut file_cache_reserved_bytes = 0;
        let mem = get_total_system_memory();

@@ -164,7 +119,7 @@ impl Runner {
                false => FileCacheConfig::default_in_memory(),
            };

-            let mut file_cache = FileCacheState::new(connstr, config, token)
+            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
                .context("failed to create file cache")?;

@@ -197,15 +152,35 @@ impl Runner {
            state.filecache = Some(file_cache);
        }

-        if let Some(cgroup) = &state.cgroup {
-            let available = mem - file_cache_reserved_bytes;
-            let value = cgroup.config.calculate_memory_high_value(available);
+        if let Some(name) = &args.cgroup {
+            let (mut cgroup, cgroup_event_stream) =
+                CgroupWatcher::new(name.clone(), requesting_send)
+                    .context("failed to create cgroup manager")?;

-            info!(value, "setting memory.high");
+            let available = mem - file_cache_reserved_bytes;

            cgroup
-                .set_memory_high_bytes(value)
-                .context("failed to set cgroup memory.high")?;
+                .set_memory_limits(available)
+                .context("failed to set cgroup memory limits")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            // Some might call this . . . cgroup v2
+            let cgroup_clone = Arc::clone(&cgroup);
+
+            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
+                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
+            });
+
+            state.cgroup = Some(cgroup);
+        } else {
+            // *NOTE*: We need to forget the sender so that its drop impl does not get run.
+            // This allows us to poll it in `Monitor::run` regardless of whether we
+            // are managing a cgroup or not. If we don't forget it, all receives will
+            // immediately return an error because the sender is dropped and it will
+            // claim all select! statements, effectively turning `Monitor::run` into
+            // `loop { fail to receive }`.
+            mem::forget(requesting_send);
        }

        Ok(state)
@@ -282,11 +257,14 @@ impl Runner {
                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
            }

-            // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
-            // since it is properly initialized in the previous cgroup if let block
+            let limits = MemoryLimits {
+                // new_cgroup_mem_high is initialized to 0 but it is guaranteed not to be 0 here
+                // since it is properly initialized in the previous cgroup if let block
+                high: new_cgroup_mem_high,
+            };
            cgroup
-                .set_memory_high_bytes(new_cgroup_mem_high)
-                .context("failed to set cgroup memory.high")?;
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;

            let message = format!(
                "set cgroup memory.high to {} MiB, of new max {} MiB",
@@ -349,9 +327,12 @@ impl Runner {
                name = cgroup.path(),
                "updating cgroup memory.high",
            );
+            let limits = MemoryLimits {
+                high: new_cgroup_mem_high,
+            };
             cgroup
-                .set_memory_high_bytes(new_cgroup_mem_high)
-                .context("failed to set cgroup memory.high")?;
+                .set_limits(&limits)
+                .context("failed to set cgroup memory limits")?;
        }

        Ok(())
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -81,7 +81,6 @@ enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 tempfile.workspace = true
-async-channel = "1.9.0"

 [dev-dependencies]
 criterion.workspace = true
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,7 +8,6 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
-use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -21,7 +20,6 @@ use metrics::set_build_info_metric;
 use pageserver::{
    config::{defaults::*, PageServerConf},
    context::{DownloadBehavior, RequestContext},
-    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
    task_mgr::TaskKind,
    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -348,22 +346,9 @@ fn start_pageserver(
        }
    };

-    // Top-level cancellation token for the process
-    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
-
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

-    // Set up deletion queue
-    let (deletion_queue, deletion_workers) = DeletionQueue::new(
-        remote_storage.clone(),
-        ControlPlaneClient::new(conf, &shutdown_pageserver),
-        conf,
-    );
-    if let Some(deletion_workers) = deletion_workers {
-        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
-    }
-
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint("initial", "Starting loading tenants");
@@ -394,13 +379,13 @@ fn start_pageserver(
    };

    // Scan the local 'tenants/' directory and start loading the tenants
-    let deletion_queue_client = deletion_queue.new_client();
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
-            deletion_queue_client,
        },
        order,
        shutdown_pageserver.clone(),
@@ -496,10 +481,9 @@ fn start_pageserver(
            http::routes::State::new(
                conf,
                http_auth.clone(),
-                remote_storage.clone(),
+                remote_storage,
                broker_client.clone(),
                disk_usage_eviction_state,
-                deletion_queue.new_client(),
            )
            .context("Failed to initialize router state")?,
        );
@@ -605,31 +589,6 @@ fn start_pageserver(
        );
    }

-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::BackgroundRuntimeTurnaroundMeasure,
-        None,
-        None,
-        "background runtime turnaround measure",
-        true,
-        async move {
-            let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
-            let server = server
-                .serve(hyper::service::make_service_fn(|_| async move {
-                    Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
-                        move |_: hyper::Request<hyper::Body>| async move {
-                            Ok::<_, std::convert::Infallible>(hyper::Response::new(
-                                hyper::Body::from(format!("alive")),
-                            ))
-                        },
-                    ))
-                }))
-                .with_graceful_shutdown(task_mgr::shutdown_watcher());
-            server.await?;
-            Ok(())
-        },
-    );
-
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

    // All started up! Now just sit and wait for shutdown signal.
@@ -652,12 +611,7 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            let bg_remote_storage = remote_storage.clone();
-            let bg_deletion_queue = deletion_queue.clone();
-            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
-                bg_remote_storage.map(|_| bg_deletion_queue),
-                0,
-            ));
+            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
            unreachable!()
        }
    })
@@ -669,7 +623,7 @@ fn create_remote_storage_client(
    let config = if let Some(config) = &conf.remote_storage_config {
        config
    } else {
-        tracing::warn!("no remote storage configured, this is a deprecated configuration");
+        // No remote storage configured.
        return Ok(None);
    };

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,7 +11,6 @@ use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
-use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
 use reqwest::Url;
@@ -208,9 +207,6 @@ pub struct PageServerConf {
    pub background_task_maximum_delay: Duration,

    pub control_plane_api: Option<Url>,
-
-    /// JWT token for use with the control plane API.
-    pub control_plane_api_token: Option<SecretString>,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -287,7 +283,6 @@ struct PageServerConfigBuilder {
    background_task_maximum_delay: BuilderValue<Duration>,

    control_plane_api: BuilderValue<Option<Url>>,
-    control_plane_api_token: BuilderValue<Option<SecretString>>,
 }

 impl Default for PageServerConfigBuilder {
@@ -352,7 +347,6 @@ impl Default for PageServerConfigBuilder {
            .unwrap()),

            control_plane_api: Set(None),
-            control_plane_api_token: Set(None),
        }
    }
 }
@@ -481,8 +475,8 @@ impl PageServerConfigBuilder {
        self.background_task_maximum_delay = BuilderValue::Set(delay);
    }

-    pub fn control_plane_api(&mut self, api: Option<Url>) {
-        self.control_plane_api = BuilderValue::Set(api)
+    pub fn control_plane_api(&mut self, api: Url) {
+        self.control_plane_api = BuilderValue::Set(Some(api))
    }

    pub fn build(self) -> anyhow::Result<PageServerConf> {
@@ -573,9 +567,6 @@ impl PageServerConfigBuilder {
            control_plane_api: self
                .control_plane_api
                .ok_or(anyhow!("missing control_plane_api"))?,
-            control_plane_api_token: self
-                .control_plane_api_token
-                .ok_or(anyhow!("missing control_plane_api_token"))?,
        })
    }
 }
@@ -589,27 +580,6 @@ impl PageServerConf {
        self.workdir.join(TENANTS_SEGMENT_NAME)
    }

-    pub fn deletion_prefix(&self) -> PathBuf {
-        self.workdir.join("deletion")
-    }
-
-    pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
-        // Encode a version in the filename, so that if we ever switch away from JSON we can
-        // increment this.
-        const VERSION: u8 = 1;
-
-        self.deletion_prefix()
-            .join(format!("{sequence:016x}-{VERSION:02x}.list"))
-    }
-
-    pub fn deletion_header_path(&self) -> PathBuf {
-        // Encode a version in the filename, so that if we ever switch away from JSON we can
-        // increment this.
-        const VERSION: u8 = 1;
-
-        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
-    }
-
    pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
        self.tenants_path().join(tenant_id.to_string())
    }
@@ -777,14 +747,7 @@ impl PageServerConf {
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
-                "control_plane_api" => {
-                    let parsed = parse_toml_string(key, item)?;
-                    if parsed.is_empty() {
-                        builder.control_plane_api(None)
-                    } else {
-                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
-                    }
-                },
+                "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -954,7 +917,6 @@ impl PageServerConf {
            ondemand_download_behavior_treat_error_as_warn: false,
            background_task_maximum_delay: Duration::ZERO,
            control_plane_api: None,
-            control_plane_api_token: None,
        }
    }
 }
@@ -1178,8 +1140,7 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: humantime::parse_duration(
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
-                control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api: None
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1235,8 +1196,7 @@ background_task_maximum_delay = '334 s'
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api: None
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,9 +1,7 @@
 use std::collections::HashMap;

-use pageserver_api::control_api::{
-    ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
-};
-use serde::{de::DeserializeOwned, Serialize};
+use hyper::StatusCode;
+use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
 use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{
@@ -14,34 +12,25 @@ use utils::{

 use crate::config::PageServerConf;

+// Backoffs when control plane requests do not succeed: compromise between reducing load
+// on control plane, and retrying frequently when we are blocked on a control plane
+// response to make progress.
+const BACKOFF_INCREMENT: f64 = 0.1;
+const BACKOFF_MAX: f64 = 10.0;
+
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub struct ControlPlaneClient {
+pub(crate) struct ControlPlaneClient {
    http_client: reqwest::Client,
    base_url: Url,
    node_id: NodeId,
    cancel: CancellationToken,
 }

-/// Represent operations which internally retry on all errors other than
-/// cancellation token firing: the only way they can fail is ShuttingDown.
-pub enum RetryForeverError {
-    ShuttingDown,
-}
-
-#[async_trait::async_trait]
-pub trait ControlPlaneGenerationsApi {
-    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
-    async fn validate(
-        &self,
-        tenants: Vec<(TenantId, Generation)>,
-    ) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
-}
-
 impl ControlPlaneClient {
    /// A None return value indicates that the input `conf` object does not have control
    /// plane API enabled.
-    pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
+    pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
        let mut url = match conf.control_plane_api.as_ref() {
            Some(u) => u.clone(),
            None => return None,
@@ -53,78 +42,39 @@ impl ControlPlaneClient {
            segs.pop_if_empty().push("");
        }

-        let mut client = reqwest::ClientBuilder::new();
-
-        if let Some(jwt) = &conf.control_plane_api_token {
-            let mut headers = hyper::HeaderMap::new();
-            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
-            client = client.default_headers(headers);
-        }
+        let client = reqwest::ClientBuilder::new()
+            .build()
+            .expect("Failed to construct http client");

        Some(Self {
-            http_client: client.build().expect("Failed to construct HTTP client"),
+            http_client: client,
            base_url: url,
            node_id: conf.id,
            cancel: cancel.clone(),
        })
    }

-    async fn retry_http_forever<R, T>(
+    async fn try_re_attach(
        &self,
-        url: &url::Url,
-        request: R,
-    ) -> Result<T, RetryForeverError>
-    where
-        R: Serialize,
-        T: DeserializeOwned,
-    {
-        #[derive(thiserror::Error, Debug)]
-        enum RemoteAttemptError {
-            #[error("shutdown")]
-            Shutdown,
-            #[error("remote: {0}")]
-            Remote(reqwest::Error),
-        }
-
-        match backoff::retry(
-            || async {
-                let response = self
-                    .http_client
-                    .post(url.clone())
-                    .json(&request)
-                    .send()
-                    .await
-                    .map_err(RemoteAttemptError::Remote)?;
-
-                response
-                    .error_for_status_ref()
-                    .map_err(RemoteAttemptError::Remote)?;
-                response
-                    .json::<T>()
-                    .await
-                    .map_err(RemoteAttemptError::Remote)
-            },
-            |_| false,
-            3,
-            u32::MAX,
-            "calling control plane generation validation API",
-            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
-        )
-        .await
-        {
-            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
-            Err(RemoteAttemptError::Remote(_)) => {
-                panic!("We retry forever, this should never be reached");
+        url: Url,
+        request: &ReAttachRequest,
+    ) -> anyhow::Result<ReAttachResponse> {
+        match self.http_client.post(url).json(request).send().await {
+            Err(e) => Err(anyhow::Error::from(e)),
+            Ok(r) => {
+                if r.status() == StatusCode::OK {
+                    r.json::<ReAttachResponse>()
+                        .await
+                        .map_err(anyhow::Error::from)
+                } else {
+                    Err(anyhow::anyhow!("Unexpected status {}", r.status()))
+                }
            }
-            Ok(r) => Ok(r),
        }
    }
-}

-#[async_trait::async_trait]
-impl ControlPlaneGenerationsApi for ControlPlaneClient {
-    /// Block until we get a successful response, or error out if we are shut down
-    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+    /// Block until we get a successful response
+    pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
@@ -133,47 +83,37 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            node_id: self.node_id,
        };

-        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
-        tracing::info!(
-            "Received re-attach response with {} tenants",
-            response.tenants.len()
-        );
+        let mut attempt = 0;
+        loop {
+            let result = self.try_re_attach(re_attach_path.clone(), &request).await;
+            match result {
+                Ok(res) => {
+                    tracing::info!(
+                        "Received re-attach response with {} tenants",
+                        res.tenants.len()
+                    );

-        Ok(response
-            .tenants
-            .into_iter()
-            .map(|t| (t.id, Generation::new(t.generation)))
-            .collect::<HashMap<_, _>>())
-    }
-
-    /// Block until we get a successful response, or error out if we are shut down
-    async fn validate(
-        &self,
-        tenants: Vec<(TenantId, Generation)>,
-    ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
-        let re_attach_path = self
-            .base_url
-            .join("validate")
-            .expect("Failed to build validate path");
-
-        let request = ValidateRequest {
-            tenants: tenants
-                .into_iter()
-                .map(|(id, gen)| ValidateRequestTenant {
-                    id,
-                    gen: gen
-                        .into()
-                        .expect("Generation should always be valid for a Tenant doing deletions"),
-                })
-                .collect(),
-        };
-
-        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
-
-        Ok(response
-            .tenants
-            .into_iter()
-            .map(|rt| (rt.id, rt.valid))
-            .collect())
+                    return Ok(res
+                        .tenants
+                        .into_iter()
+                        .map(|t| (t.id, Generation::new(t.generation)))
+                        .collect::<HashMap<_, _>>());
+                }
+                Err(e) => {
+                    tracing::error!("Error re-attaching tenants, retrying: {e:#}");
+                    backoff::exponential_backoff(
+                        attempt,
+                        BACKOFF_INCREMENT,
+                        BACKOFF_MAX,
+                        &self.cancel,
+                    )
+                    .await;
+                    if self.cancel.is_cancelled() {
+                        return Err(anyhow::anyhow!("Shutting down"));
+                    }
+                    attempt += 1;
+                }
+            }
+        }
    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -1,156 +0,0 @@
-//! The deleter is the final stage in the deletion queue.  It accumulates remote
-//! paths to delete, and periodically executes them in batches of up to 1000
-//! using the DeleteObjects request.
-//!
-//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
-//! number of full-sized DeleteObjects requests, rather than a larger number of
-//! smaller requests.
-
-use remote_storage::GenericRemoteStorage;
-use remote_storage::RemotePath;
-use remote_storage::MAX_KEYS_PER_DELETE;
-use std::time::Duration;
-use tokio_util::sync::CancellationToken;
-use tracing::info;
-use tracing::warn;
-
-use crate::metrics;
-
-use super::DeletionQueueError;
-use super::FlushOp;
-
-const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
-
-pub(super) enum DeleterMessage {
-    Delete(Vec<RemotePath>),
-    Flush(FlushOp),
-}
-
-/// Non-persistent deletion queue, for coalescing multiple object deletes into
-/// larger DeleteObjects requests.
-pub(super) struct Deleter {
-    // Accumulate up to 1000 keys for the next deletion operation
-    accumulator: Vec<RemotePath>,
-
-    rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
-
-    cancel: CancellationToken,
-    remote_storage: GenericRemoteStorage,
-}
-
-impl Deleter {
-    pub(super) fn new(
-        remote_storage: GenericRemoteStorage,
-        rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
-        cancel: CancellationToken,
-    ) -> Self {
-        Self {
-            remote_storage,
-            rx,
-            cancel,
-            accumulator: Vec::new(),
-        }
-    }
-
-    /// Wrap the remote `delete_objects` with a failpoint
-    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
-        fail::fail_point!("deletion-queue-before-execute", |_| {
-            info!("Skipping execution, failpoint set");
-            metrics::DELETION_QUEUE
-                .remote_errors
-                .with_label_values(&["failpoint"])
-                .inc();
-            Err(anyhow::anyhow!("failpoint hit"))
-        });
-
-        self.remote_storage.delete_objects(&self.accumulator).await
-    }
-
-    /// Block until everything in accumulator has been executed
-    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
-        while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
-            match self.remote_delete().await {
-                Ok(()) => {
-                    // Note: we assume that the remote storage layer returns Ok(()) if some
-                    // or all of the deleted objects were already gone.
-                    metrics::DELETION_QUEUE
-                        .keys_executed
-                        .inc_by(self.accumulator.len() as u64);
-                    info!(
-                        "Executed deletion batch {}..{}",
-                        self.accumulator
-                            .first()
-                            .expect("accumulator should be non-empty"),
-                        self.accumulator
-                            .last()
-                            .expect("accumulator should be non-empty"),
-                    );
-                    self.accumulator.clear();
-                }
-                Err(e) => {
-                    warn!("DeleteObjects request failed: {e:#}, will retry");
-                    metrics::DELETION_QUEUE
-                        .remote_errors
-                        .with_label_values(&["execute"])
-                        .inc();
-                }
-            };
-        }
-        if self.cancel.is_cancelled() {
-            // Expose an error because we may not have actually flushed everything
-            Err(DeletionQueueError::ShuttingDown)
-        } else {
-            Ok(())
-        }
-    }
-
-    pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
-        self.accumulator.reserve(MAX_KEYS_PER_DELETE);
-
-        loop {
-            if self.cancel.is_cancelled() {
-                return Err(DeletionQueueError::ShuttingDown);
-            }
-
-            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
-                Ok(Some(m)) => m,
-                Ok(None) => {
-                    // All queue senders closed
-                    info!("Shutting down");
-                    return Err(DeletionQueueError::ShuttingDown);
-                }
-                Err(_) => {
-                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
-                    // return immediately if no work is pending
-                    self.flush().await?;
-
-                    continue;
-                }
-            };
-
-            match msg {
-                DeleterMessage::Delete(mut list) => {
-                    while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
-                        if self.accumulator.len() == MAX_KEYS_PER_DELETE {
-                            self.flush().await?;
-                            // flush() only returns Ok once the accumulator has been fully drained
-                            assert_eq!(self.accumulator.len(), 0);
-                        }
-
-                        let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
-                        let take_count = std::cmp::min(available_slots, list.len());
-                        for path in list.drain(list.len() - take_count..) {
-                            self.accumulator.push(path);
-                        }
-                    }
-                }
-                DeleterMessage::Flush(flush_op) => {
-                    // If flush() errors, we drop the flush_op and the caller will get
-                    // an error recv()'ing their oneshot channel.
-                    self.flush().await?;
-                    flush_op.notify();
-                }
-            }
-        }
-    }
-}
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -1,487 +0,0 @@
-//! The list writer is the first stage in the deletion queue.  It accumulates
-//! layers to delete, and periodically writes out these layers into a persistent
-//! DeletionList.
-//!
-//! The purpose of writing DeletionLists is to decouple the decision to
-//! delete an object from the validation required to execute it: even if
-//! validation is not possible, e.g. due to a control plane outage, we can
-//! still persist our intent to delete an object, in a way that would
-//! survive a restart.
-//!
-//! DeletionLists are passed onwards to the Validator.
-
-use super::DeletionHeader;
-use super::DeletionList;
-use super::FlushOp;
-use super::ValidatorQueueMessage;
-
-use std::collections::HashMap;
-use std::fs::create_dir_all;
-use std::time::Duration;
-
-use regex::Regex;
-use remote_storage::RemotePath;
-use tokio_util::sync::CancellationToken;
-use tracing::debug;
-use tracing::info;
-use tracing::warn;
-use utils::generation::Generation;
-use utils::id::TenantId;
-use utils::id::TimelineId;
-
-use crate::config::PageServerConf;
-use crate::deletion_queue::TEMP_SUFFIX;
-use crate::metrics;
-use crate::tenant::remote_timeline_client::remote_layer_path;
-use crate::tenant::storage_layer::LayerFileName;
-
-// The number of keys in a DeletionList before we will proactively persist it
-// (without reaching a flush deadline).  This aims to deliver objects of the order
-// of magnitude 1MB when we are under heavy delete load.
-const DELETION_LIST_TARGET_SIZE: usize = 16384;
-
-// Ordinarily, we only flush to DeletionList periodically, to bound the window during
-// which we might leak objects from not flushing a DeletionList after
-// the objects are already unlinked from timeline metadata.
-const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
-
-// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
-// more objects before doing the flush.
-const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
-
-#[derive(Debug)]
-pub(super) struct DeletionOp {
-    pub(super) tenant_id: TenantId,
-    pub(super) timeline_id: TimelineId,
-    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
-    // have a config object handy to project it to a remote key, and need the consuming worker
-    // to do it for you.
-    pub(super) layers: Vec<(LayerFileName, Generation)>,
-    pub(super) objects: Vec<RemotePath>,
-
-    /// The _current_ generation of the Tenant attachment in which we are enqueuing
-    /// this deletion.
-    pub(super) generation: Generation,
-}
-
-#[derive(Debug)]
-pub(super) struct RecoverOp {
-    pub(super) attached_tenants: HashMap<TenantId, Generation>,
-}
-
-#[derive(Debug)]
-pub(super) enum ListWriterQueueMessage {
-    Delete(DeletionOp),
-    // Wait until all prior deletions make it into a persistent DeletionList
-    Flush(FlushOp),
-    // Wait until all prior deletions have been executed (i.e. objects are actually deleted)
-    FlushExecute(FlushOp),
-    // Call once after re-attaching to control plane, to notify the deletion queue about
-    // latest attached generations & load any saved deletion lists from disk.
-    Recover(RecoverOp),
-}
-
-pub(super) struct ListWriter {
-    conf: &'static PageServerConf,
-
-    // Incoming frontend requests to delete some keys
-    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
-
-    // Outbound requests to the backend to execute deletion lists we have composed.
-    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
-
-    // The list we are currently building, contains a buffer of keys to delete
-    // and our next sequence number
-    pending: DeletionList,
-
-    // These FlushOps should notify the next time we flush
-    pending_flushes: Vec<FlushOp>,
-
-    // Worker loop is torn down when this fires.
-    cancel: CancellationToken,
-
-    // Safety guard to do recovery exactly once
-    recovered: bool,
-}
-
-impl ListWriter {
-    // Initially DeletionHeader.validated_sequence is zero.  The place we start our
-    // sequence numbers must be higher than that.
-    const BASE_SEQUENCE: u64 = 1;
-
-    pub(super) fn new(
-        conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
-        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
-        cancel: CancellationToken,
-    ) -> Self {
-        Self {
-            pending: DeletionList::new(Self::BASE_SEQUENCE),
-            conf,
-            rx,
-            tx,
-            pending_flushes: Vec::new(),
-            cancel,
-            recovered: false,
-        }
-    }
-
-    /// Try to flush `list` to persistent storage
-    ///
-    /// This does not return errors, because on failure to flush we do not lose
-    /// any state: flushing will be retried implicitly on the next deadline
-    async fn flush(&mut self) {
-        if self.pending.is_empty() {
-            for f in self.pending_flushes.drain(..) {
-                f.notify();
-            }
-            return;
-        }
-
-        match self.pending.save(self.conf).await {
-            Ok(_) => {
-                info!(sequence = self.pending.sequence, "Stored deletion list");
-
-                for f in self.pending_flushes.drain(..) {
-                    f.notify();
-                }
-
-                // Take the list we've accumulated, replace it with a fresh list for the next sequence
-                let next_list = DeletionList::new(self.pending.sequence + 1);
-                let list = std::mem::replace(&mut self.pending, next_list);
-
-                if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(list)).await {
-                    // This is allowed to fail: it will only happen if the backend worker is shut down,
-                    // so we can just drop this on the floor.
-                    info!("Deletion list dropped, this is normal during shutdown ({e:#})");
-                }
-            }
-            Err(e) => {
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                warn!(
-                    sequence = self.pending.sequence,
-                    "Failed to write deletion list, will retry later ({e:#})"
-                );
-            }
-        }
-    }
-
-    /// Load the header, to learn the sequence number up to which deletions
-    /// have been validated.  We will apply validated=true to DeletionLists
-    /// <= this sequence when loading them.
-    ///
-    /// It is not an error for the header to not exist: we return None, and
-    /// the caller should act as if validated_sequence is 0
-    async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
-        let header_path = self.conf.deletion_header_path();
-        match tokio::fs::read(&header_path).await {
-            Ok(header_bytes) => {
-                match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
-                    Ok(h) => Ok(Some(h.validated_sequence)),
-                    Err(e) => {
-                        warn!(
-                            "Failed to deserialize deletion header, ignoring {}: {e:#}",
-                            header_path.display()
-                        );
-                        // This should never happen unless we make a mistake with our serialization.
-                        // Ignoring a deletion header is not consequential for correctness because all deletions
-                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
-                        metrics::DELETION_QUEUE.unexpected_errors.inc();
-                        Ok(None)
-                    }
-                }
-            }
-            Err(e) => {
-                if e.kind() == std::io::ErrorKind::NotFound {
-                    debug!(
-                        "Deletion header {} not found, first start?",
-                        header_path.display()
-                    );
-                    Ok(None)
-                } else {
-                    Err(anyhow::anyhow!(e))
-                }
-            }
-        }
-    }
-
-    async fn recover(
-        &mut self,
-        attached_tenants: HashMap<TenantId, Generation>,
-    ) -> Result<(), anyhow::Error> {
-        debug!(
-            "recovering with {} attached tenants",
-            attached_tenants.len()
-        );
-
-        // Load the header
-        let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
-
-        self.pending.sequence = validated_sequence + 1;
-
-        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
-            Ok(d) => d,
-            Err(e) => {
-                warn!(
-                    "Failed to open deletion list directory {}: {e:#}",
-                    deletion_directory.display(),
-                );
-
-                // Give up: if we can't read the deletion list directory, we probably can't
-                // write lists into it later, so the queue won't work.
-                return Err(e.into());
-            }
-        };
-
-        let list_name_pattern =
-            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
-
-        let header_path = self.conf.deletion_header_path();
-        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await? {
-            let file_name = dentry.file_name();
-            let dentry_str = file_name.to_string_lossy();
-
-            if Some(file_name.as_os_str()) == header_path.file_name() {
-                // Don't try and parse the header's name like a list
-                continue;
-            }
-
-            if dentry_str.ends_with(TEMP_SUFFIX) {
-                info!("Cleaning up temporary file {dentry_str}");
-                let absolute_path = deletion_directory.join(dentry.file_name());
-                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
-                    // Non-fatal error: we will just leave the file behind but not
-                    // try and load it.
-                    warn!(
-                        "Failed to clean up temporary file {}: {e:#}",
-                        absolute_path.display()
-                    );
-                }
-
-                continue;
-            }
-
-            let file_name = dentry.file_name().to_owned();
-            let basename = file_name.to_string_lossy();
-            let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
-                m.name("sequence")
-                    .expect("Non optional group should be present")
-                    .as_str()
-            } else {
-                warn!("Unexpected key in deletion queue: {basename}");
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                continue;
-            };
-
-            let seq: u64 = match u64::from_str_radix(seq_part, 16) {
-                Ok(s) => s,
-                Err(e) => {
-                    warn!("Malformed key '{basename}': {e}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
-                    continue;
-                }
-            };
-            seqs.push(seq);
-        }
-        seqs.sort();
-
-        // Start our next deletion list from after the last location validated by
-        // previous process lifetime, or after the last location found (it is updated
-        // below after enumerating the deletion lists)
-        self.pending.sequence = validated_sequence + 1;
-        if let Some(max_list_seq) = seqs.last() {
-            self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
-        }
-
-        for s in seqs {
-            let list_path = self.conf.deletion_list_path(s);
-
-            let list_bytes = tokio::fs::read(&list_path).await?;
-
-            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
-                Ok(l) => l,
-                Err(e) => {
-                    // Drop the list on the floor: any objects it referenced will be left behind
-                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
-                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
-                    continue;
-                }
-            };
-
-            if deletion_list.sequence <= validated_sequence {
-                // If the deletion list falls below valid_seq, we may assume that it was
-                // already validated the last time this pageserver ran.  Otherwise, we still
-                // load it, as it may still contain content valid in this generation.
-                deletion_list.validated = true;
-            } else {
-                // Special case optimization: if a tenant is still attached, and no other
-                // generation was issued to another node in the interval while we restarted,
-                // then we may treat deletion lists from the previous generation as if they
-                // belong to our currently attached generation, and proceed to validate & execute.
-                for (tenant_id, tenant_list) in &mut deletion_list.tenants {
-                    if let Some(attached_gen) = attached_tenants.get(tenant_id) {
-                        if attached_gen.previous() == tenant_list.generation {
-                            tenant_list.generation = *attached_gen;
-                        }
-                    }
-                }
-            }
-
-            info!(
-                validated = deletion_list.validated,
-                sequence = deletion_list.sequence,
-                "Recovered deletion list"
-            );
-
-            // We will drop out of recovery if this fails: it indicates that we are shutting down
-            // or the backend has panicked
-            metrics::DELETION_QUEUE
-                .keys_submitted
-                .inc_by(deletion_list.len() as u64);
-            self.tx
-                .send(ValidatorQueueMessage::Delete(deletion_list))
-                .await?;
-        }
-
-        info!(next_sequence = self.pending.sequence, "Replay complete");
-
-        Ok(())
-    }
-
-    /// This is the front-end ingest, where we bundle up deletion requests into DeletionList
-    /// and write them out, for later validation by the backend and execution by the executor.
-    pub(super) async fn background(&mut self) {
-        info!("Started deletion frontend worker");
-
-        // Synchronous, but we only do it once per process lifetime so it's tolerable
-        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
-            tracing::error!(
-                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
-                self.conf.deletion_prefix().display()
-            );
-            metrics::DELETION_QUEUE.unexpected_errors.inc();
-            return;
-        }
-
-        while !self.cancel.is_cancelled() {
-            let timeout = if self.pending_flushes.is_empty() {
-                FRONTEND_DEFAULT_TIMEOUT
-            } else {
-                FRONTEND_FLUSHING_TIMEOUT
-            };
-
-            let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
-                Ok(Some(msg)) => msg,
-                Ok(None) => {
-                    // Queue sender destroyed, shutting down
-                    break;
-                }
-                Err(_) => {
-                    // Hit deadline, flush.
-                    self.flush().await;
-                    continue;
-                }
-            };
-
-            match msg {
-                ListWriterQueueMessage::Delete(op) => {
-                    assert!(
-                        self.recovered,
-                        "Cannot process deletions before recovery.  This is a bug."
-                    );
-
-                    debug!(
-                        "Delete: ingesting {} layers, {} other objects",
-                        op.layers.len(),
-                        op.objects.len()
-                    );
-
-                    let mut layer_paths = Vec::new();
-                    for (layer, generation) in op.layers {
-                        layer_paths.push(remote_layer_path(
-                            &op.tenant_id,
-                            &op.timeline_id,
-                            &layer,
-                            generation,
-                        ));
-                    }
-                    layer_paths.extend(op.objects);
-
-                    if !self.pending.push(
-                        &op.tenant_id,
-                        &op.timeline_id,
-                        op.generation,
-                        &mut layer_paths,
-                    ) {
-                        self.flush().await;
-                        let retry_succeeded = self.pending.push(
-                            &op.tenant_id,
-                            &op.timeline_id,
-                            op.generation,
-                            &mut layer_paths,
-                        );
-                        if !retry_succeeded {
-                            // Unexpected: after we flush, we should have
-                            // drained self.pending, so a conflict on
-                            // generation numbers should be impossible.
-                            tracing::error!(
-                                "Failed to enqueue deletions, leaking objects.  This is a bug."
-                            );
-                            metrics::DELETION_QUEUE.unexpected_errors.inc();
-                        }
-                    }
-                }
-                ListWriterQueueMessage::Flush(op) => {
-                    if self.pending.is_empty() {
-                        // Execute immediately
-                        debug!("Flush: No pending objects, flushing immediately");
-                        op.notify()
-                    } else {
-                        // Execute next time we flush
-                        debug!("Flush: adding to pending flush list for next deadline flush");
-                        self.pending_flushes.push(op);
-                    }
-                }
-                ListWriterQueueMessage::FlushExecute(op) => {
-                    debug!("FlushExecute: passing through to backend");
-                    // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
-                    if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
-                        info!("Can't flush, shutting down ({e})");
-                        // Caller will get error when their oneshot sender was dropped.
-                    }
-                }
-                ListWriterQueueMessage::Recover(op) => {
-                    if self.recovered {
-                        tracing::error!(
-                            "Deletion queue recovery called more than once.  This is a bug."
-                        );
-                        metrics::DELETION_QUEUE.unexpected_errors.inc();
-                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
-                        continue;
-                    }
-
-                    if let Err(e) = self.recover(op.attached_tenants).await {
-                        // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
-                        // queue receiver has been dropped, or something is critically broken with
-                        // the local filesystem holding deletion lists.
-                        info!(
-                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
-                        );
-                        metrics::DELETION_QUEUE.unexpected_errors.inc();
-                        return;
-                    } else {
-                        self.recovered = true;
-                    }
-                }
-            }
-
-            if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
-                self.flush().await;
-            }
-        }
-        info!("Deletion queue shut down.");
-    }
-}
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -1,414 +0,0 @@
-//! The validator is responsible for validating DeletionLists for execution,
-//! based on whether the generation in the DeletionList is still the latest
-//! generation for a tenant.
-//!
-//! The purpose of validation is to ensure split-brain safety in the cluster
-//! of pageservers: a deletion may only be executed if the tenant generation
-//! that originated it is still current.  See docs/rfcs/025-generation-numbers.md
-//! The purpose of accumulating lists before validating them is to reduce load
-//! on the control plane API by issuing fewer, larger requests.
-//!
-//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
-//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
-//! to decide when old WAL can be removed.
-//!
-//! Deletions are passed onward to the Deleter.
-
-use std::collections::HashMap;
-use std::path::PathBuf;
-use std::sync::Arc;
-use std::time::Duration;
-
-use tokio_util::sync::CancellationToken;
-use tracing::debug;
-use tracing::info;
-use tracing::warn;
-
-use crate::config::PageServerConf;
-use crate::control_plane_client::ControlPlaneGenerationsApi;
-use crate::control_plane_client::RetryForeverError;
-use crate::metrics;
-
-use super::deleter::DeleterMessage;
-use super::DeletionHeader;
-use super::DeletionList;
-use super::DeletionQueueError;
-use super::FlushOp;
-use super::VisibleLsnUpdates;
-
-// After this length of time, do any validation work that is pending,
-// even if we haven't accumulated many keys to delete.
-//
-// This also causes updates to remote_consistent_lsn to be validated, even
-// if there were no deletions enqueued.
-const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
-
-// If we have received this number of keys, proceed with attempting to execute
-const AUTOFLUSH_KEY_COUNT: usize = 16384;
-
-#[derive(Debug)]
-pub(super) enum ValidatorQueueMessage {
-    Delete(DeletionList),
-    Flush(FlushOp),
-}
-pub(super) struct Validator<C>
-where
-    C: ControlPlaneGenerationsApi,
-{
-    conf: &'static PageServerConf,
-    rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
-    tx: tokio::sync::mpsc::Sender<DeleterMessage>,
-
-    // Client for calling into control plane API for validation of deletes
-    control_plane_client: Option<C>,
-
-    // DeletionLists which are awaiting generation validation.  Not safe to
-    // execute until [`validate`] has processed them.
-    pending_lists: Vec<DeletionList>,
-
-    // DeletionLists which have passed validation and are ready to execute.
-    validated_lists: Vec<DeletionList>,
-
-    // Sum of all the lengths of lists in pending_lists
-    pending_key_count: usize,
-
-    // Lsn validation state: we read projected LSNs and write back visible LSNs
-    // after validation.  This is the LSN equivalent of `pending_lists`:
-    // it is drained in [`validate`]
-    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
-
-    // If we failed to rewrite a deletion list due to local filesystem I/O failure,
-    // we must remember that and refuse to advance our persistent validated sequence
-    // number past the failure.
-    list_write_failed: Option<u64>,
-
-    cancel: CancellationToken,
-}
-
-impl<C> Validator<C>
-where
-    C: ControlPlaneGenerationsApi,
-{
-    pub(super) fn new(
-        conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
-        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
-        control_plane_client: Option<C>,
-        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
-        cancel: CancellationToken,
-    ) -> Self {
-        Self {
-            conf,
-            rx,
-            tx,
-            control_plane_client,
-            lsn_table,
-            pending_lists: Vec::new(),
-            validated_lists: Vec::new(),
-            pending_key_count: 0,
-            list_write_failed: None,
-            cancel,
-        }
-    }
-    /// Process any outstanding validations of generations of pending LSN updates or pending
-    /// DeletionLists.
-    ///
-    /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
-    /// go into the queue of ready-to-execute lists.
-    async fn validate(&mut self) -> Result<(), DeletionQueueError> {
-        let mut tenant_generations = HashMap::new();
-        for list in &self.pending_lists {
-            for (tenant_id, tenant_list) in &list.tenants {
-                // Note: DeletionLists are in logical time order, so generation always
-                // goes up.  By doing a simple insert() we will always end up with
-                // the latest generation seen for a tenant.
-                tenant_generations.insert(*tenant_id, tenant_list.generation);
-            }
-        }
-
-        let pending_lsn_updates = {
-            let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
-            std::mem::take(&mut *lsn_table)
-        };
-        for (tenant_id, update) in &pending_lsn_updates.tenants {
-            let entry = tenant_generations
-                .entry(*tenant_id)
-                .or_insert(update.generation);
-            if update.generation > *entry {
-                *entry = update.generation;
-            }
-        }
-
-        if tenant_generations.is_empty() {
-            // No work to do
-            return Ok(());
-        }
-
-        let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
-            match control_plane_client
-                .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
-                .await
-            {
-                Ok(tenants) => tenants,
-                Err(RetryForeverError::ShuttingDown) => {
-                    // The only way a validation call returns an error is when the cancellation token fires
-                    return Err(DeletionQueueError::ShuttingDown);
-                }
-            }
-        } else {
-            // Control plane API disabled.  In legacy mode we consider everything valid.
-            tenant_generations.keys().map(|k| (*k, true)).collect()
-        };
-
-        let mut validated_sequence: Option<u64> = None;
-
-        // Apply the validation results to the pending LSN updates
-        for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
-            let validated_generation = tenant_generations
-                .get(&tenant_id)
-                .expect("Map was built from the same keys we're reading");
-
-            let valid = tenants_valid
-                .get(&tenant_id)
-                .copied()
-                // If the tenant was missing from the validation response, it has been deleted.
-                // The Timeline that requested the LSN update is probably already torn down,
-                // or will be torn down soon.  In this case, drop the update by setting valid=false.
-                .unwrap_or(false);
-
-            if valid && *validated_generation == tenant_lsn_state.generation {
-                for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
-                    pending_lsn.result_slot.store(pending_lsn.projected);
-                }
-            } else {
-                // If we failed validation, then do not apply any of the projected updates
-                warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
-                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
-            }
-        }
-
-        // Apply the validation results to the pending deletion lists
-        for list in &mut self.pending_lists {
-            // Filter the list based on whether the server responded valid: true.
-            // If a tenant is omitted in the response, it has been deleted: its entries
-            // are treated as invalid and dropped, since the deletion would be redundant.
-            let mut mutated = false;
-            list.tenants.retain(|tenant_id, tenant| {
-                let validated_generation = tenant_generations
-                    .get(tenant_id)
-                    .expect("Map was built from the same keys we're reading");
-
-                // If the tenant was missing from the validation response, it has been deleted.
-                // This means that a deletion is valid, but also redundant since the tenant's
-                // objects should have already been deleted.  Treat it as invalid to drop the
-                // redundant deletion.
-                let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
-
-                // A list is valid if it comes from the current _or previous_ generation.
-                // - The previous generation case is permitted due to how we store deletion lists locally:
-                // if we see the immediately previous generation in a locally stored deletion list,
-                // it proves that this node's disk was used for both current & previous generations,
-                // and therefore no other node was involved in between: the two generations may be
-                // logically treated as the same.
-                // - In that previous generation case, we rewrote it to the current generation
-                // in recover(), so the comparison here is simply an equality.
-
-                let this_list_valid = valid
-                    && (tenant.generation == *validated_generation);
-
-                if !this_list_valid {
-                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
-                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
-                    mutated = true;
-                }
-                this_list_valid
-            });
-            list.validated = true;
-
-            if mutated {
-                // Save the deletion list if we had to make changes due to stale generations.  The
-                // saved list is valid for execution.
-                if let Err(e) = list.save(self.conf).await {
-                    // Highly unexpected.  Could happen if e.g. disk full.
-                    // If we didn't save the trimmed list, it is _not_ valid to execute.
-                    warn!("Failed to save modified deletion list {list}: {e:#}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
-
-                    // Rather than have a complex retry process, just drop it and leak the objects,
-                    // scrubber will clean up eventually.
-                    list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
-
-                    // We must remember this failure, to prevent later writing out a header that
-                    // would imply the unwritable list was valid on disk.
-                    if self.list_write_failed.is_none() {
-                        self.list_write_failed = Some(list.sequence);
-                    }
-                }
-            }
-
-            validated_sequence = Some(list.sequence);
-        }
-
-        if let Some(validated_sequence) = validated_sequence {
-            if let Some(list_write_failed) = self.list_write_failed {
-                // Rare error case: we failed to write out a deletion list to excise invalid
-                // entries, so we cannot advance the header's valid sequence number past that point.
-                //
-                // In this state we will continue to validate, execute and delete deletion lists,
-                // we just cannot update the header.  It should be noticed and fixed by a human due to
-                // the nonzero value of our unexpected_errors metric.
-                warn!(
-                    sequence_number = list_write_failed,
-                    "Cannot write header because writing a deletion list failed earlier",
-                );
-            } else {
-                // Write the queue header to record how far validation progressed.  This avoids having
-                // to rewrite each DeletionList to set validated=true in it.
-                let header = DeletionHeader::new(validated_sequence);
-
-                // Failure is tolerated because the validated_sequence is an optimization.  If we fail to save it,
-                // then restart, we will drop some deletion lists, creating work for scrubber.
-                // On error we log a warning and bump the unexpected_errors metric.
-                if let Err(e) = header.save(self.conf).await {
-                    warn!("Failed to write deletion queue header: {e:#}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
-                }
-            }
-        }
-
-        // Transfer the validated lists to the validated queue, for eventual execution
-        self.validated_lists.append(&mut self.pending_lists);
-
-        Ok(())
-    }
-
-    async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
-        for list_path in list_paths {
-            debug!("Removing deletion list {}", list_path.display());
-
-            if let Err(e) = tokio::fs::remove_file(&list_path).await {
-                // Unexpected: we should have permissions and nothing else should
-                // be touching these files.  We will leave the file behind.  Subsequent
-                // pageservers will try and load it again: hopefully whatever storage
-                // issue (probably permissions) has been fixed by then.
-                tracing::error!("Failed to delete {}: {e:#}", list_path.display());
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                break;
-            }
-        }
-    }
-
-    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
-        tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
-
-        // Issue any required generation validation calls to the control plane
-        self.validate().await?;
-
-        // After successful validation, nothing is pending: any lists that
-        // made it through validation will be in validated_lists.
-        assert!(self.pending_lists.is_empty());
-        self.pending_key_count = 0;
-
-        tracing::debug!(
-            "Validation complete, have {} validated lists",
-            self.validated_lists.len()
-        );
-
-        // Return quickly if we have no validated lists to execute.  This avoids flushing the
-        // executor when an idle backend hits its autoflush interval
-        if self.validated_lists.is_empty() {
-            return Ok(());
-        }
-
-        // Drain `validated_lists` into the executor
-        let mut executing_lists = Vec::new();
-        for list in self.validated_lists.drain(..) {
-            let list_path = self.conf.deletion_list_path(list.sequence);
-            let objects = list.into_remote_paths();
-            self.tx
-                .send(DeleterMessage::Delete(objects))
-                .await
-                .map_err(|_| DeletionQueueError::ShuttingDown)?;
-            executing_lists.push(list_path);
-        }
-
-        self.flush_executor().await?;
-
-        // Erase the deletion lists whose keys have all been deleted from remote storage
-        self.cleanup_lists(executing_lists).await;
-
-        Ok(())
-    }
-
-    async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
-        // Flush the executor, so that all the keys referenced by these deletion lists
-        // are actually removed from remote storage.  This is a precondition to deleting
-        // the deletion lists themselves.
-        let (flush_op, rx) = FlushOp::new();
-        self.tx
-            .send(DeleterMessage::Flush(flush_op))
-            .await
-            .map_err(|_| DeletionQueueError::ShuttingDown)?;
-
-        rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
-    }
-
-    pub(super) async fn background(&mut self) {
-        tracing::info!("Started deletion backend worker");
-
-        while !self.cancel.is_cancelled() {
-            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
-                Ok(Some(m)) => m,
-                Ok(None) => {
-                    // All queue senders closed
-                    info!("Shutting down");
-                    break;
-                }
-                Err(_) => {
-                    // Timeout, we hit deadline to execute whatever we have in hand.  These functions will
-                    // return immediately if no work is pending.
-                    match self.flush().await {
-                        Ok(()) => {}
-                        Err(DeletionQueueError::ShuttingDown) => {
-                            // If we are shutting down, then auto-flush can safely be skipped
-                        }
-                    }
-
-                    continue;
-                }
-            };
-
-            match msg {
-                ValidatorQueueMessage::Delete(list) => {
-                    if list.validated {
-                        // A pre-validated list may only be seen during recovery, if we are recovering
-                        // a DeletionList whose on-disk state has validated=true
-                        self.validated_lists.push(list)
-                    } else {
-                        self.pending_key_count += list.len();
-                        self.pending_lists.push(list);
-                    }
-
-                    if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
-                        match self.flush().await {
-                            Ok(()) => {}
-                            Err(DeletionQueueError::ShuttingDown) => {
-                                // If we are shutting down, then auto-flush can safely be skipped
-                            }
-                        }
-                    }
-                }
-                ValidatorQueueMessage::Flush(op) => {
-                    match self.flush().await {
-                        Ok(()) => {
-                            op.notify();
-                        }
-                        Err(DeletionQueueError::ShuttingDown) => {
-                            // If we fail due to shutting down, we will just drop `op` to propagate that status.
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1093,9 +1093,6 @@ components:
        remote_consistent_lsn:
          type: string
          format: hex
-        remote_consistent_lsn_visible:
-          type: string
-          format: hex
        ancestor_timeline_id:
          type: string
          format: hex
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -5,7 +5,6 @@ use std::collections::HashMap;
 use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
-use futures::TryFutureExt;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -25,7 +24,6 @@ use super::models::{
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -36,7 +34,7 @@ use crate::tenant::mgr::{
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
@@ -63,7 +61,6 @@ pub struct State {
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
-    deletion_queue_client: DeletionQueueClient,
 }

 impl State {
@@ -73,7 +70,6 @@ impl State {
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
-        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
@@ -86,17 +82,8 @@ impl State {
            remote_storage,
            broker_client,
            disk_usage_eviction_state,
-            deletion_queue_client,
        })
    }
-
-    fn tenant_resources(&self) -> TenantSharedResources {
-        TenantSharedResources {
-            broker_client: self.broker_client.clone(),
-            remote_storage: self.remote_storage.clone(),
-            deletion_queue_client: self.deletion_queue_client.clone(),
-        }
-    }
 }

 #[inline(always)]
@@ -296,12 +283,7 @@ async fn build_timeline_info_common(
    };
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
-    let remote_consistent_lsn_projected = timeline
-        .get_remote_consistent_lsn_projected()
-        .unwrap_or(Lsn(0));
-    let remote_consistent_lsn_visible = timeline
-        .get_remote_consistent_lsn_visible()
-        .unwrap_or(Lsn(0));
+    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

    let walreceiver_status = timeline.walreceiver_status();

@@ -311,8 +293,7 @@ async fn build_timeline_info_common(
        ancestor_timeline_id,
        ancestor_lsn,
        disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
-        remote_consistent_lsn: remote_consistent_lsn_projected,
-        remote_consistent_lsn_visible,
+        remote_consistent_lsn,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -511,23 +492,24 @@ async fn tenant_attach_handler(

    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;

-    if state.remote_storage.is_none() {
+    if let Some(remote_storage) = &state.remote_storage {
+        mgr::attach_tenant(
+            state.conf,
+            tenant_id,
+            generation,
+            tenant_conf,
+            state.broker_client.clone(),
+            remote_storage.clone(),
+            &ctx,
+        )
+        .instrument(info_span!("tenant_attach", %tenant_id))
+        .await?;
+    } else {
        return Err(ApiError::BadRequest(anyhow!(
            "attach_tenant is not possible because pageserver was configured without remote storage"
        )));
    }

-    mgr::attach_tenant(
-        state.conf,
-        tenant_id,
-        generation,
-        tenant_conf,
-        state.tenant_resources(),
-        &ctx,
-    )
-    .instrument(info_span!("tenant_attach", %tenant_id))
-    .await?;
-
    json_response(StatusCode::ACCEPTED, ())
 }

@@ -588,7 +570,6 @@ async fn tenant_load_handler(
        generation,
        state.broker_client.clone(),
        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
        &ctx,
    )
    .instrument(info_span!("load", %tenant_id))
@@ -930,7 +911,8 @@ async fn tenant_create_handler(
        tenant_conf,
        target_tenant_id,
        generation,
-        state.tenant_resources(),
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
        &ctx,
    )
    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1147,39 +1129,6 @@ async fn timeline_download_remote_layers_handler_get(
    json_response(StatusCode::OK, info)
 }

-async fn deletion_queue_flush(
-    r: Request<Body>,
-    cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&r);
-
-    if state.remote_storage.is_none() {
-        // Nothing to do if remote storage is disabled.
-        return json_response(StatusCode::OK, ());
-    }
-
-    let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
-
-    let flush = async {
-        if execute {
-            state.deletion_queue_client.flush_execute().await
-        } else {
-            state.deletion_queue_client.flush().await
-        }
-    }
-    // DeletionQueueError's only case is shutting down.
-    .map_err(|_| ApiError::ShuttingDown);
-
-    tokio::select! {
-        res = flush => {
-            res.map(|()| json_response(StatusCode::OK, ()))?
-        }
-        _ = cancel.cancelled() => {
-            Err(ApiError::ShuttingDown)
-        }
-    }
-}
-
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -1514,9 +1463,6 @@ pub fn make_router(
        .put("/v1/disk_usage_eviction/run", |r| {
            api_handler(r, disk_usage_eviction_run)
        })
-        .put("/v1/deletion_queue/flush", |r| {
-            api_handler(r, deletion_queue_flush)
-        })
        .put("/v1/tenant/:tenant_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,8 +3,7 @@ pub mod basebackup;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
-pub mod control_plane_client;
-pub mod deletion_queue;
+mod control_plane_client;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
@@ -28,7 +27,6 @@ pub mod failpoint_support;
 use std::path::Path;

 use crate::task_mgr::TaskKind;
-use deletion_queue::DeletionQueue;
 use tracing::info;

 /// Current storage format version
@@ -50,8 +48,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

-#[tracing::instrument(skip_all, fields(%exit_code))]
-pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
+#[tracing::instrument]
+pub async fn shutdown_pageserver(exit_code: i32) {
    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
@@ -79,11 +77,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    )
    .await;

-    // Best effort to persist any outstanding deletions, to avoid leaking objects
-    if let Some(mut deletion_queue) = deletion_queue {
-        deletion_queue.shutdown(Duration::from_secs(5)).await;
-    }
-
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -264,46 +264,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

-pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_page_cache_acquire_pinned_slot_seconds",
-        "Time spent acquiring a pinned slot in the page cache",
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_page_cache_find_victim_iters_total",
-        "Counter for the number of iterations in the find_victim loop",
-    )
-    .expect("failed to define a metric")
-});
-
-static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "page_cache_errors_total",
-        "Number of timeouts while acquiring a pinned slot in the page cache",
-        &["error_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-#[derive(IntoStaticStr)]
-#[strum(serialize_all = "kebab_case")]
-pub(crate) enum PageCacheErrorKind {
-    AcquirePinnedSlotTimeout,
-    EvictIterLimit,
-}
-
-pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
-    PAGE_CACHE_ERRORS
-        .get_metric_with_label_values(&[error_kind.into()])
-        .unwrap()
-        .inc();
-}
-
 pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wait_lsn_seconds",
@@ -331,14 +291,6 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_resident_physical_size_global",
-        "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
-    )
-    .expect("failed to define a metric")
-});
-
 static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_remote_physical_size",
@@ -349,14 +301,6 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_remote_physical_size_global",
-        "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
@@ -943,54 +887,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
    .expect("failed to define a metric")
 });

-pub(crate) struct DeletionQueueMetrics {
-    pub(crate) keys_submitted: IntCounter,
-    pub(crate) keys_dropped: IntCounter,
-    pub(crate) keys_executed: IntCounter,
-    pub(crate) dropped_lsn_updates: IntCounter,
-    pub(crate) unexpected_errors: IntCounter,
-    pub(crate) remote_errors: IntCounterVec,
-}
-pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
-    DeletionQueueMetrics{
-
-    keys_submitted: register_int_counter!(
-        "pageserver_deletion_queue_submitted_total",
-        "Number of objects submitted for deletion"
-    )
-    .expect("failed to define a metric"),
-
-    keys_dropped: register_int_counter!(
-        "pageserver_deletion_queue_dropped_total",
-        "Number of object deletions dropped due to stale generation."
-    )
-    .expect("failed to define a metric"),
-
-    keys_executed: register_int_counter!(
-        "pageserver_deletion_queue_executed_total",
-        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
-    )
-    .expect("failed to define a metric"),
-
-    dropped_lsn_updates: register_int_counter!(
-        "pageserver_deletion_queue_dropped_lsn_updates_total",
-        "Updates to remote_consistent_lsn dropped due to stale generation number."
-    )
-    .expect("failed to define a metric"),
-    unexpected_errors: register_int_counter!(
-        "pageserver_deletion_queue_unexpected_errors_total",
-        "Number of unexpected condiions that may stall the queue: any value above zero is unexpected."
-    )
-    .expect("failed to define a metric"),
-    remote_errors: register_int_counter_vec!(
-        "pageserver_deletion_queue_remote_errors_total",
-        "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
-        &["op_kind"],
-    )
-    .expect("failed to define a metric")
-}
-});
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -1265,7 +1161,7 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    resident_physical_size_gauge: UIntGauge,
+    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -1343,29 +1239,10 @@ impl TimelineMetrics {
    }

    pub fn record_new_file_metrics(&self, sz: u64) {
-        self.resident_physical_size_add(sz);
+        self.resident_physical_size_gauge.add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }
-
-    pub fn resident_physical_size_sub(&self, sz: u64) {
-        self.resident_physical_size_gauge.sub(sz);
-        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
-    }
-
-    pub fn resident_physical_size_add(&self, sz: u64) {
-        self.resident_physical_size_gauge.add(sz);
-        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
-    }
-
-    pub fn resident_physical_size_set(&self, sz: u64) {
-        self.resident_physical_size_gauge.set(sz);
-        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
-    }
-
-    pub fn resident_physical_size_get(&self) -> u64 {
-        self.resident_physical_size_gauge.get()
-    }
 }

 impl Drop for TimelineMetrics {
@@ -1373,10 +1250,7 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
-        {
-            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
-            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
-        }
+        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1430,43 +1304,10 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;

-/// Maintain a per timeline gauge in addition to the global gauge.
-struct PerTimelineRemotePhysicalSizeGauge {
-    last_set: u64,
-    gauge: UIntGauge,
-}
-
-impl PerTimelineRemotePhysicalSizeGauge {
-    fn new(per_timeline_gauge: UIntGauge) -> Self {
-        Self {
-            last_set: per_timeline_gauge.get(),
-            gauge: per_timeline_gauge,
-        }
-    }
-    fn set(&mut self, sz: u64) {
-        self.gauge.set(sz);
-        if sz < self.last_set {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
-        } else {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
-        };
-        self.last_set = sz;
-    }
-    fn get(&self) -> u64 {
-        self.gauge.get()
-    }
-}
-
-impl Drop for PerTimelineRemotePhysicalSizeGauge {
-    fn drop(&mut self) {
-        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
-    }
-}
-
 pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
-    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
+    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -1484,24 +1325,18 @@ impl RemoteTimelineClientMetrics {
        }
    }

-    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
+    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
-        let gauge = guard.get_or_insert_with(|| {
-            PerTimelineRemotePhysicalSizeGauge::new(
+        guard
+            .get_or_insert_with(|| {
                REMOTE_PHYSICAL_SIZE
                    .get_metric_with_label_values(&[
                        &self.tenant_id.to_string(),
                        &self.timeline_id.to_string(),
                    ])
-                    .unwrap(),
-            )
-        });
-        gauge.set(sz);
-    }
-
-    pub(crate) fn remote_physical_size_get(&self) -> u64 {
-        let guard = self.remote_physical_size_gauge.lock().unwrap();
-        guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
+                    .unwrap()
+            })
+            .clone()
    }

    pub fn remote_operation_time(
@@ -1840,9 +1675,6 @@ pub fn preinitialize_metrics() {
        Lazy::force(c);
    });

-    // Deletion queue stats
-    Lazy::force(&DELETION_QUEUE);
-
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,12 +75,7 @@
 use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
-    sync::{
-        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
-        Arc, Weak,
-    },
-    task::Poll,
-    time::Duration,
+    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
 };

 use anyhow::Context;
@@ -170,8 +165,6 @@ struct Slot {

 struct SlotInner {
    key: Option<CacheKey>,
-    // for `coalesce_readers_permit`
-    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
    buf: &'static mut [u8; PAGE_SZ],
 }

@@ -214,22 +207,6 @@ impl Slot {
    }
 }

-impl SlotInner {
-    /// If there is aready a reader, drop our permit and share its permit, just like we share read access.
-    fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
-        let mut guard = self.permit.lock().unwrap();
-        if let Some(existing_permit) = guard.upgrade() {
-            drop(guard);
-            drop(permit);
-            existing_permit
-        } else {
-            let permit = Arc::new(permit);
-            *guard = Arc::downgrade(&permit);
-            permit
-        }
-    }
-}
-
 pub struct PageCache {
    /// This contains the mapping from the cache key to buffer slot that currently
    /// contains the page, if any.
@@ -247,42 +224,30 @@ pub struct PageCache {
    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,

-    pinned_slots: Arc<tokio::sync::Semaphore>,
-
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,

-    find_victim_sender:
-        async_channel::Sender<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
-    find_victim_waiters:
-        async_channel::Receiver<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
-
    size_metrics: &'static PageCacheSizeMetrics,
 }

-struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
-
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i> {
-    _permit: Arc<PinnedSlotsPermit>,
-    slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
-}
+pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);

 impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.slot_guard.buf
+        self.0.buf
    }
 }

 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
    fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.slot_guard.buf
+        self.0.buf
    }
 }

@@ -297,23 +262,16 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 /// to initialize.
 ///
 pub struct PageWriteGuard<'i> {
-    state: PageWriteGuardState<'i>,
-}
+    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,

-enum PageWriteGuardState<'i> {
-    Invalid {
-        inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
-        _permit: PinnedSlotsPermit,
-    },
-    Downgraded,
+    // Are the page contents currently valid?
+    // Used to mark pages as invalid that are assigned but not yet filled with data.
+    valid: bool,
 }

 impl std::ops::DerefMut for PageWriteGuard<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => &mut inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+        self.inner.buf
    }
 }

@@ -321,37 +279,25 @@ impl std::ops::Deref for PageWriteGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        match &self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => &inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+        self.inner.buf
    }
 }

 impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => &mut inner.buf,
-            PageWriteGuardState::Downgraded => todo!(),
-        }
+        self.inner.buf
    }
 }

-impl<'a> PageWriteGuard<'a> {
+impl PageWriteGuard<'_> {
    /// Mark that the buffer contents are now valid.
-    #[must_use]
-    pub fn mark_valid(mut self) -> PageReadGuard<'a> {
-        let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
-        match prev {
-            PageWriteGuardState::Invalid { inner, _permit } => {
-                assert!(inner.key.is_some());
-                PageReadGuard {
-                    _permit: Arc::new(_permit),
-                    slot_guard: inner.downgrade(),
-                }
-            }
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+    pub fn mark_valid(&mut self) {
+        assert!(self.inner.key.is_some());
+        assert!(
+            !self.valid,
+            "mark_valid called on a buffer that was already valid"
+        );
+        self.valid = true;
    }
 }

@@ -362,13 +308,11 @@ impl Drop for PageWriteGuard<'_> {
    /// initializing it, remove the mapping from the page cache.
    ///
    fn drop(&mut self) {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => {
-                let self_key = inner.key.as_ref().unwrap();
-                PAGE_CACHE.get().unwrap().remove_mapping(self_key);
-                inner.key = None;
-            }
-            PageWriteGuardState::Downgraded => {}
+        assert!(self.inner.key.is_some());
+        if !self.valid {
+            let self_key = self.inner.key.as_ref().unwrap();
+            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
+            self.inner.key = None;
        }
    }
 }
@@ -381,7 +325,7 @@ pub enum ReadBufResult<'a> {

 /// lock_for_write() return value
 pub enum WriteBufResult<'a> {
-    Found(PageReadGuard<'a>),
+    Found(PageWriteGuard<'a>),
    NotFound(PageWriteGuard<'a>),
 }

@@ -404,10 +348,6 @@ impl PageCache {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Option<(Lsn, PageReadGuard)> {
-        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
-            return None;
-        };
-
        crate::metrics::PAGE_CACHE
            .for_ctx(ctx)
            .read_accesses_materialized_page
@@ -422,10 +362,7 @@ impl PageCache {
            lsn,
        };

-        if let Some(guard) = self
-            .try_lock_for_read(&mut cache_key, &mut Some(permit))
-            .await
-        {
+        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
            if let CacheKey::MaterializedPage {
                hash_key: _,
                lsn: available_lsn,
@@ -455,7 +392,7 @@ impl PageCache {
    /// Store an image of the given page in the cache.
    ///
    pub async fn memorize_materialized_page(
-        &'static self,
+        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: Key,
@@ -472,15 +409,15 @@ impl PageCache {
        };

        match self.lock_for_write(&cache_key).await? {
-            WriteBufResult::Found(read_guard) => {
+            WriteBufResult::Found(write_guard) => {
                // We already had it in cache. Another thread must've put it there
                // concurrently. Check that it had the same contents that we
                // replayed.
-                assert!(*read_guard == img);
+                assert!(*write_guard == img);
            }
            WriteBufResult::NotFound(mut write_guard) => {
                write_guard.copy_from_slice(img);
-                let _ = write_guard.mark_valid();
+                write_guard.mark_valid();
            }
        }

@@ -490,7 +427,7 @@ impl PageCache {
    // Section 1.2: Public interface functions for working with immutable file pages.

    pub async fn read_immutable_buf(
-        &'static self,
+        &self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
@@ -508,16 +445,6 @@ impl PageCache {
    // "mappings" after this section. But the routines in this section should
    // not require changes.

-    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
-        let _timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
-        Ok(PinnedSlotsPermit(
-            Arc::clone(&self.pinned_slots)
-                .acquire_owned()
-                .await
-                .unwrap(),
-        ))
-    }
-
    /// Look up a page in the cache.
    ///
    /// If the search criteria is not exact, *cache_key is updated with the key
@@ -527,11 +454,7 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    async fn try_lock_for_read(
-        &self,
-        cache_key: &mut CacheKey,
-        permit: &mut Option<PinnedSlotsPermit>,
-    ) -> Option<PageReadGuard> {
+    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
@@ -541,10 +464,7 @@ impl PageCache {
            let inner = slot.inner.read().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
-                return Some(PageReadGuard {
-                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
-                    slot_guard: inner,
-                });
+                return Some(PageReadGuard(inner));
            } else {
                // search_mapping might have modified the search key; restore it.
                *cache_key = cache_key_orig;
@@ -583,12 +503,10 @@ impl PageCache {
    /// ```
    ///
    async fn lock_for_read(
-        &'static self,
+        &self,
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
-
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
@@ -605,21 +523,17 @@ impl PageCache {
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
-                debug_assert!(permit.is_none());
+            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
                if is_first_iteration {
                    hit.inc();
                }
                return Ok(ReadBufResult::Found(read_guard));
            }
-            debug_assert!(permit.is_some());
            is_first_iteration = false;

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) =
+                self.find_victim().context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -641,41 +555,27 @@ impl PageCache {
            inner.key = Some(cache_key.clone());
            slot.set_usage_count(1);

-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
-                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-
            return Ok(ReadBufResult::NotFound(PageWriteGuard {
-                state: PageWriteGuardState::Invalid {
-                    _permit: permit.take().unwrap(),
-                    inner,
-                },
+                inner,
+                valid: false,
            }));
        }
    }

-    // FIXME: the name is wrong.
-    async fn try_lock_for_write(
-        &self,
-        cache_key: &CacheKey,
-        permit: &mut Option<PinnedSlotsPermit>,
-    ) -> Option<PageReadGuard> {
+    /// Look up a page in the cache and lock it in write mode. If it's not
+    /// found, returns None.
+    ///
+    /// When locking a page for writing, the search criteria is always "exact".
+    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we don't released the mapping
            // lock already, another thread could have evicted the page)
            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.read().await;
+            let inner = slot.inner.write().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
-                return Some(PageReadGuard {
-                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
-                    slot_guard: inner,
-                });
+                return Some(PageWriteGuard { inner, valid: true });
            }
        }
        None
@@ -685,21 +585,16 @@ impl PageCache {
    ///
    /// Similar to lock_for_read(), but the returned buffer is write-locked and
    /// may be modified by the caller even if it's already found in the cache.
-    async fn lock_for_write(&'static self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
        loop {
            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
-                debug_assert!(permit.is_none());
+            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
                return Ok(WriteBufResult::Found(write_guard));
            }
-            debug_assert!(permit.is_some());

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) =
+                self.find_victim().context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -721,19 +616,9 @@ impl PageCache {
            inner.key = Some(cache_key.clone());
            slot.set_usage_count(1);

-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
-                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-
            return Ok(WriteBufResult::NotFound(PageWriteGuard {
-                state: PageWriteGuardState::Invalid {
-                    _permit: permit.take().unwrap(),
-                    inner,
-                },
+                inner,
+                valid: false,
            }));
        }
    }
@@ -884,21 +769,8 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    async fn find_victim(
-        &'static self,
-        _permit_witness: &PinnedSlotsPermit,
-    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
-        // Get in line.
-        let mut receiver = self.find_victim_waiters.recv();
-        // If we get cancelled at the receiver.await below, the victim slot
-        // remains in the channel. Consume these first before going into
-        // the loop below.
-        match futures::poll!(&mut receiver) {
-            Poll::Ready(Ok(res)) => return Ok(res),
-            Poll::Ready(Err(_closed)) => unreachable!("we never close the channel"),
-            Poll::Pending => {} // the regular case where we aren't cancelled below
-        };
-
+    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
            iters += 1;
@@ -910,8 +782,14 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
-                            unreachable!("find_victim_waiters prevents starvation");
+                        // If we have looped through the whole buffer pool 10 times
+                        // and still haven't found a victim buffer, something's wrong.
+                        // Maybe all the buffers were locked. That could happen in
+                        // theory, if you have more threads holding buffers locked than
+                        // there are buffers in the pool. In practice, with a reasonably
+                        // large buffer pool it really shouldn't happen.
+                        if iters > iter_limit {
+                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
                    }
@@ -921,11 +799,7 @@ impl PageCache {
                    self.remove_mapping(old_key);
                    inner.key = None;
                }
-                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
-                self.find_victim_sender
-                    .try_send((slot_idx, inner))
-                    .expect("we always get in line first");
-                return Ok(receiver.await.unwrap());
+                return Ok((slot_idx, inner));
            }
        }
    }
@@ -952,26 +826,18 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        permit: std::sync::Mutex::new(Weak::new()),
-                    }),
+                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
            .collect();

-        let (find_victim_sender, find_victim_waiters) = async_channel::bounded(num_pages);
        Self {
            materialized_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
-            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-            find_victim_sender,
-            find_victim_waiters,
        }
    }
 }
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,7 @@ impl Key {
            | self.field6 as i128
    }

-    pub const fn from_i128(x: i128) -> Self {
+    pub fn from_i128(x: i128) -> Self {
        Key {
            field1: ((x >> 120) & 0xf) as u8,
            field2: ((x >> 104) & 0xFFFF) as u32,
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -293,8 +293,6 @@ pub enum TaskKind {

    DebugTool,

-    BackgroundRuntimeTurnaroundMeasure,
-
    #[cfg(test)]
    UnitTest,
 }
@@ -458,7 +456,7 @@ async fn task_finish(
    }

    if shutdown_process {
-        shutdown_pageserver(None, 1).await;
+        shutdown_pageserver(1).await;
    }
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -57,7 +57,6 @@ use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::deletion_queue::DeletionQueueClient;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT_ACTIVATION;
@@ -118,7 +117,7 @@ mod span;

 pub mod metadata;
 mod par_fsync;
-pub mod remote_timeline_client;
+mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
@@ -158,7 +157,6 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
 pub struct TenantSharedResources {
    pub broker_client: storage_broker::BrokerClientChannel,
    pub remote_storage: Option<GenericRemoteStorage>,
-    pub deletion_queue_client: DeletionQueueClient,
 }

 ///
@@ -199,9 +197,6 @@ pub struct Tenant {
    // provides access to timeline data sitting in the remote storage
    pub(crate) remote_storage: Option<GenericRemoteStorage>,

-    // Access to global deletion queue for when this tenant wants to schedule a deletion
-    deletion_queue_client: DeletionQueueClient,
-
    /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -528,20 +523,15 @@ impl Tenant {
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        generation: Generation,
-        resources: TenantSharedResources,
+        broker_client: storage_broker::BrokerClientChannel,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        remote_storage: GenericRemoteStorage,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
        // TODO dedup with spawn_load
        let tenant_conf =
            Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;

-        let TenantSharedResources {
-            broker_client,
-            remote_storage,
-            deletion_queue_client,
-        } = resources;
-
        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Arc::new(Tenant::new(
            TenantState::Attaching,
@@ -550,8 +540,7 @@ impl Tenant {
            wal_redo_manager,
            tenant_id,
            generation,
-            remote_storage.clone(),
-            deletion_queue_client,
+            Some(remote_storage.clone()),
        ));

        // Do all the hard work in the background
@@ -582,7 +571,7 @@ impl Tenant {
                let pending_deletion = {
                    match DeleteTenantFlow::should_resume_deletion(
                        conf,
-                        remote_storage.as_ref(),
+                        Some(&remote_storage),
                        &tenant_clone,
                    )
                    .await
@@ -671,7 +660,6 @@ impl Tenant {
        for timeline_id in remote_timeline_ids {
            let client = RemoteTimelineClient::new(
                remote_storage.clone(),
-                self.deletion_queue_client.clone(),
                self.conf,
                self.tenant_id,
                timeline_id,
@@ -738,7 +726,6 @@ impl Tenant {
                remote_metadata,
                TimelineResources {
                    remote_client: Some(remote_client),
-                    deletion_queue_client: self.deletion_queue_client.clone(),
                },
                ctx,
            )
@@ -763,7 +750,6 @@ impl Tenant {
                timeline_id,
                &index_part.metadata,
                Some(remote_timeline_client),
-                self.deletion_queue_client.clone(),
                None,
            )
            .await
@@ -865,7 +851,6 @@ impl Tenant {
            tenant_id,
            Generation::broken(),
            None,
-            DeletionQueueClient::broken(),
        ))
    }

@@ -910,7 +895,6 @@ impl Tenant {
            tenant_id,
            generation,
            remote_storage.clone(),
-            resources.deletion_queue_client.clone(),
        );
        let tenant = Arc::new(tenant);

@@ -1318,7 +1302,6 @@ impl Tenant {
                                timeline_id,
                                &local_metadata,
                                Some(remote_client),
-                                self.deletion_queue_client.clone(),
                                init_order,
                            )
                            .await
@@ -1368,7 +1351,6 @@ impl Tenant {
                        timeline_id,
                        &local_metadata,
                        None,
-                        self.deletion_queue_client.clone(),
                        init_order,
                    )
                    .await
@@ -2260,9 +2242,6 @@ impl Tenant {
        Ok(timeline)
    }

-    // Allow too_many_arguments because a constructor's argument list naturally grows with the
-    // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
-    #[allow(clippy::too_many_arguments)]
    fn new(
        state: TenantState,
        conf: &'static PageServerConf,
@@ -2271,7 +2250,6 @@ impl Tenant {
        tenant_id: TenantId,
        generation: Generation,
        remote_storage: Option<GenericRemoteStorage>,
-        deletion_queue_client: DeletionQueueClient,
    ) -> Tenant {
        let (state, mut rx) = watch::channel(state);

@@ -2339,7 +2317,6 @@ impl Tenant {
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
            remote_storage,
-            deletion_queue_client,
            state,
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2879,7 +2856,6 @@ impl Tenant {
        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
            let remote_client = RemoteTimelineClient::new(
                remote_storage.clone(),
-                self.deletion_queue_client.clone(),
                self.conf,
                self.tenant_id,
                timeline_id,
@@ -2890,10 +2866,7 @@ impl Tenant {
            None
        };

-        TimelineResources {
-            remote_client,
-            deletion_queue_client: self.deletion_queue_client.clone(),
-        }
+        TimelineResources { remote_client }
    }

    /// Creates intermediate timeline structure and its files.
@@ -3349,7 +3322,6 @@ pub mod harness {
    use utils::logging;
    use utils::lsn::Lsn;

-    use crate::deletion_queue::mock::MockDeletionQueue;
    use crate::{
        config::PageServerConf,
        repository::Key,
@@ -3411,7 +3383,6 @@ pub mod harness {
        pub generation: Generation,
        pub remote_storage: GenericRemoteStorage,
        pub remote_fs_dir: PathBuf,
-        pub deletion_queue: MockDeletionQueue,
    }

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3460,7 +3431,6 @@ pub mod harness {
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
-            let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));

            Ok(Self {
                conf,
@@ -3469,7 +3439,6 @@ pub mod harness {
                generation: Generation::new(0xdeadbeef),
                remote_storage,
                remote_fs_dir,
-                deletion_queue,
            })
        }

@@ -3494,7 +3463,6 @@ pub mod harness {
                self.tenant_id,
                self.generation,
                Some(self.remote_storage.clone()),
-                self.deletion_queue.new_client(),
            ));
            tenant
                .load(None, ctx)
@@ -4225,8 +4193,7 @@ mod tests {
    //
    #[tokio::test]
    async fn test_bulk_insert() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_bulk_insert")?;
-        let (tenant, ctx) = harness.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4273,8 +4240,7 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_random_updates")?;
-        let (tenant, ctx) = harness.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -186,22 +186,27 @@ impl FileBlockReader {
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
-        match cache
-            .read_immutable_buf(self.file_id, blknum, ctx)
-            .await
-            .map_err(|e| {
-                std::io::Error::new(
-                    std::io::ErrorKind::Other,
-                    format!("Failed to read immutable buf: {e:#}"),
-                )
-            })? {
-            ReadBufResult::Found(guard) => return Ok(guard.into()),
-            ReadBufResult::NotFound(mut write_guard) => {
-                // Read the page from disk into the buffer
-                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
-                return Ok(write_guard.mark_valid().into());
-            }
-        };
+        loop {
+            match cache
+                .read_immutable_buf(self.file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        format!("Failed to read immutable buf: {e:#}"),
+                    )
+                })? {
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
+                ReadBufResult::NotFound(mut write_guard) => {
+                    // Read the page from disk into the buffer
+                    self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                    write_guard.mark_valid();
+
+                    // Page is filled and marked valid: loop to look it up again and return it as a read guard
+                    continue;
+                }
+            };
+        }
    }
 }

--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -70,34 +70,38 @@ impl EphemeralFile {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
-            match cache
-                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                .await
-                .map_err(|e| {
-                    std::io::Error::new(
-                        std::io::ErrorKind::Other,
-                        // order path before error because error is anyhow::Error => might have many contexts
-                        format!(
-                            "ephemeral file: read immutable page #{}: {}: {:#}",
-                            blknum,
-                            self.file.path.display(),
-                            e,
-                        ),
-                    )
-                })? {
-                page_cache::ReadBufResult::Found(guard) => {
-                    return Ok(BlockLease::PageReadGuard(guard))
-                }
-                page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                    let buf: &mut [u8] = write_guard.deref_mut();
-                    debug_assert_eq!(buf.len(), PAGE_SZ);
-                    self.file
-                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
-                        .await?;
-                    let read_guard = write_guard.mark_valid();
-                    return Ok(BlockLease::PageReadGuard(read_guard));
-                }
-            };
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                    .await
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum,
+                                self.file.path.display(),
+                                e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                            .await?;
+                        write_guard.mark_valid();
+
+                        // Page is filled and marked valid: loop to look it up again and return it as a read guard
+                        continue;
+                    }
+                };
+            }
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -167,7 +171,7 @@ impl EphemeralFile {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
-                                        let _ = write_guard.mark_valid();
+                                        write_guard.mark_valid();
                                        // pre-warm successful
                                    }
                                    Err(e) => {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,10 +20,7 @@ use utils::crashsafe;

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::{
-    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
-};
-use crate::deletion_queue::DeletionQueueClient;
+use crate::control_plane_client::ControlPlaneClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::delete::DeleteTenantFlow;
@@ -119,28 +116,7 @@ pub async fn init_tenant_mgr(

    // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
    let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
-        let result = match client.re_attach().await {
-            Ok(tenants) => tenants,
-            Err(RetryForeverError::ShuttingDown) => {
-                anyhow::bail!("Shut down while waiting for control plane re-attach response")
-            }
-        };
-
-        // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
-        // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
-        // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
-        // are processed, even though we don't block on recovery completing here.
-        //
-        // Must only do this if remote storage is enabled, otherwise deletion queue
-        // is not running and channel push will fail.
-        if resources.remote_storage.is_some() {
-            resources
-                .deletion_queue_client
-                .recover(result.clone())
-                .await?;
-        }
-
-        Some(result)
+        Some(client.re_attach().await?)
    } else {
        info!("Control plane API not configured, tenant generations are disabled");
        None
@@ -309,21 +285,29 @@ pub(crate) fn schedule_local_tenant_processing(

    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if resources.remote_storage.is_none() {
-            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
-            Tenant::create_broken_tenant(
+        if let Some(remote_storage) = resources.remote_storage {
+            match Tenant::spawn_attach(
                conf,
                tenant_id,
-                "attaching mark file present but no remote storage configured".to_string(),
-            )
-        } else {
-            match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
+                generation,
+                resources.broker_client,
+                tenants,
+                remote_storage,
+                ctx,
+            ) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
                }
            }
+        } else {
+            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
+            Tenant::create_broken_tenant(
+                conf,
+                tenant_id,
+                "attaching mark file present but no remote storage configured".to_string(),
+            )
        }
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -454,7 +438,8 @@ pub async fn create_tenant(
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
    generation: Generation,
-    resources: TenantSharedResources,
+    broker_client: storage_broker::BrokerClientChannel,
+    remote_storage: Option<GenericRemoteStorage>,
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
@@ -465,9 +450,13 @@ pub async fn create_tenant(
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

+        let tenant_resources = TenantSharedResources {
+            broker_client,
+            remote_storage,
+        };
        let created_tenant =
            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                generation, resources, None, &TENANTS, ctx)?;
+                generation, tenant_resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -633,7 +622,6 @@ pub async fn load_tenant(
    generation: Generation,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
-    deletion_queue_client: DeletionQueueClient,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
@@ -647,7 +635,6 @@ pub async fn load_tenant(
        let resources = TenantSharedResources {
            broker_client,
            remote_storage,
-            deletion_queue_client
        };
        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None,  &TENANTS, ctx)
            .with_context(|| {
@@ -715,7 +702,8 @@ pub async fn attach_tenant(
    tenant_id: TenantId,
    generation: Generation,
    tenant_conf: TenantConfOpt,
-    resources: TenantSharedResources,
+    broker_client: storage_broker::BrokerClientChannel,
+    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
@@ -730,7 +718,10 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-
+        let resources = TenantSharedResources {
+            broker_client,
+            remote_storage: Some(remote_storage),
+        };
        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -116,12 +116,8 @@
 //! # Completion
 //!
 //! Once an operation has completed, we update
-//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
-//! and submit a request through the DeletionQueue to update
-//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
-//! validated that our generation is not stale.  It is this visible value
-//! that is advertized to safekeepers as a signal that that they can
-//! delete the WAL up to that LSN.
+//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
+//! to safekeepers that they can delete the WAL up to that LSN.
 //!
 //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
 //! for all pending operations to complete. It does not prevent more
@@ -204,6 +200,7 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

+mod delete;
 mod download;
 pub mod index;
 mod upload;
@@ -229,7 +226,6 @@ use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

-use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{
    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -328,8 +324,6 @@ pub struct RemoteTimelineClient {
    metrics: Arc<RemoteTimelineClientMetrics>,

    storage_impl: GenericRemoteStorage,
-
-    deletion_queue_client: DeletionQueueClient,
 }

 impl RemoteTimelineClient {
@@ -341,7 +335,6 @@ impl RemoteTimelineClient {
    ///
    pub fn new(
        remote_storage: GenericRemoteStorage,
-        deletion_queue_client: DeletionQueueClient,
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -359,7 +352,6 @@ impl RemoteTimelineClient {
            timeline_id,
            generation,
            storage_impl: remote_storage,
-            deletion_queue_client,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
        }
@@ -421,24 +413,13 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
-        match &mut *self.upload_queue.lock().unwrap() {
+    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
+        match &*self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
-            UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
-            UploadQueue::Stopped(q) => q
-                .upload_queue_for_deletion
-                .get_last_remote_consistent_lsn_projected(),
-        }
-    }
-
-    pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
-        match &mut *self.upload_queue.lock().unwrap() {
-            UploadQueue::Uninitialized => None,
-            UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
-            UploadQueue::Stopped(q) => Some(
-                q.upload_queue_for_deletion
-                    .get_last_remote_consistent_lsn_visible(),
-            ),
+            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
+            UploadQueue::Stopped(q) => {
+                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
+            }
        }
    }

@@ -453,11 +434,11 @@ impl RemoteTimelineClient {
        } else {
            0
        };
-        self.metrics.remote_physical_size_set(size);
+        self.metrics.remote_physical_size_gauge().set(size);
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_get()
+        self.metrics.remote_physical_size_gauge().get()
    }

    //
@@ -662,7 +643,7 @@ impl RemoteTimelineClient {
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: Vec<LayerFileName>,
+        names: &[LayerFileName],
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
@@ -682,10 +663,10 @@ impl RemoteTimelineClient {
            // Decorate our list of names with each name's generation, dropping
            // makes that are unexpectedly missing from our metadata.
            let with_generations: Vec<_> = names
-                .into_iter()
+                .iter()
                .filter_map(|name| {
                    // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(&name);
+                    let meta = upload_queue.latest_files.remove(name);

                    if let Some(meta) = meta {
                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -707,16 +688,18 @@ impl RemoteTimelineClient {
                self.schedule_index_upload(upload_queue, metadata);
            }

-            for (name, gen) in &with_generations {
-                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
-            }
-
            // schedule the actual deletions
-            let op = UploadOp::Delete(Delete {
-                layers: with_generations,
-            });
-            self.calls_unfinished_metric_begin(&op);
-            upload_queue.queued_operations.push_back(op);
+            for (name, generation) in with_generations {
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: false,
+                    generation,
+                });
+                self.calls_unfinished_metric_begin(&op);
+                upload_queue.queued_operations.push_back(op);
+                info!("scheduled layer file deletion {name}");
+            }

            // Launch the tasks immediately, if possible
            self.launch_queued_tasks(upload_queue);
@@ -850,7 +833,9 @@ impl RemoteTimelineClient {
    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        let layers: Vec<RemotePath> = {
+        let (mut receiver, deletions_queued) = {
+            let mut deletions_queued = 0;
+
            let mut locked = self.upload_queue.lock().unwrap();
            let stopped = locked.stopped_mut()?;

@@ -862,30 +847,42 @@ impl RemoteTimelineClient {

            stopped
                .upload_queue_for_deletion
-                .latest_files
-                .drain()
-                .map(|(file_name, meta)| {
-                    remote_layer_path(
-                        &self.tenant_id,
-                        &self.timeline_id,
-                        &file_name,
-                        meta.generation,
-                    )
-                })
-                .collect()
+                .queued_operations
+                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
+
+            // schedule the actual deletions
+            for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: true,
+                    generation: meta.generation,
+                });
+
+                self.calls_unfinished_metric_begin(&op);
+                stopped
+                    .upload_queue_for_deletion
+                    .queued_operations
+                    .push_back(op);
+
+                info!("scheduled layer file deletion {name}");
+                deletions_queued += 1;
+            }
+
+            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
+
+            (
+                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
+                deletions_queued,
+            )
        };

-        let layer_deletion_count = layers.len();
-        self.deletion_queue_client.push_immediate(layers).await?;
+        receiver.changed().await.context("upload queue shut down")?;

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
        let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);

-        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
-        // taking the burden of listing all the layers that we already know we should delete.
-        self.deletion_queue_client.flush_immediate().await?;
-
        let remaining = backoff::retry(
            || async {
                self.storage_impl
@@ -913,9 +910,17 @@ impl RemoteTimelineClient {
            })
            .collect();

-        let not_referenced_count = remaining.len();
        if !remaining.is_empty() {
-            self.deletion_queue_client.push_immediate(remaining).await?;
+            backoff::retry(
+                || async { self.storage_impl.delete_objects(&remaining).await },
+                |_e| false,
+                FAILED_UPLOAD_WARN_THRESHOLD,
+                FAILED_REMOTE_OP_RETRIES,
+                "delete_objects",
+                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
+            )
+            .await
+            .context("delete_objects")?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -926,14 +931,18 @@ impl RemoteTimelineClient {

        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

-        debug!("enqueuing index part deletion");
-        self.deletion_queue_client
-            .push_immediate([index_file_path].to_vec())
-            .await?;
+        debug!("deleting index part");

-        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
-        // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-        self.deletion_queue_client.flush_immediate().await?;
+        backoff::retry(
+            || async { self.storage_impl.delete(&index_file_path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "delete_index",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
+        )
+        .await
+        .context("delete_index")?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
@@ -941,7 +950,7 @@ impl RemoteTimelineClient {
            ))?
        });

-        info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
+        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");

        Ok(())
    }
@@ -1131,16 +1140,21 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(delete) => self
-                    .deletion_queue_client
-                    .push_layers(
-                        self.tenant_id,
-                        self.timeline_id,
-                        self.generation,
-                        delete.layers.clone(),
-                    )
-                    .await
-                    .map_err(|e| anyhow::anyhow!(e)),
+                UploadOp::Delete(delete) => {
+                    let path = &self
+                        .conf
+                        .timeline_path(&self.tenant_id, &self.timeline_id)
+                        .join(delete.layer_file_name.file_name());
+                    delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
+                        .measure_remote_op(
+                            self.tenant_id,
+                            self.timeline_id,
+                            delete.file_kind,
+                            RemoteOpKind::Delete,
+                            Arc::clone(&self.metrics),
+                        )
+                        .await
+                }
                UploadOp::Barrier(_) => {
                    // unreachable. Barrier operations are handled synchronously in
                    // launch_queued_tasks
@@ -1196,12 +1210,18 @@ impl RemoteTimelineClient {
        }

        // The task has completed successfully. Remove it from the in-progress list.
-        let lsn_update = {
+        {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
                UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(_stopped) => {
-                    None
+                UploadQueue::Stopped(stopped) => {
+                    // Special care is needed for deletions: if it was an earlier deletion (not scheduled from timeline deletion)
+                    // then stop() took care of it, so we just return.
+                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
+                    match &task.op {
+                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
+                        _ => None
+                    }
                },
                UploadQueue::Initialized(qi) => { Some(qi) }
            };
@@ -1216,51 +1236,23 @@ impl RemoteTimelineClient {

            upload_queue.inprogress_tasks.remove(&task.task_id);

-            let lsn_update = match task.op {
+            match task.op {
                UploadOp::UploadLayer(_, _) => {
                    upload_queue.num_inprogress_layer_uploads -= 1;
-                    None
                }
                UploadOp::UploadMetadata(_, lsn) => {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
-                    // XXX monotonicity check?
-
-                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
-                    if self.generation.is_none() {
-                        // Legacy mode: skip validating generation
-                        upload_queue.visible_remote_consistent_lsn.store(lsn);
-                        None
-                    } else {
-                        Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
-                    }
+                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
                }
                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions -= 1;
-                    None
                }
                UploadOp::Barrier(_) => unreachable!(),
            };

            // Launch any queued tasks that were unblocked by this one.
            self.launch_queued_tasks(upload_queue);
-            lsn_update
-        };
-
-        if let Some((lsn, slot)) = lsn_update {
-            // Updates to the remote_consistent_lsn we advertise to pageservers
-            // are all routed through the DeletionQueue, to enforce important
-            // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
-            self.deletion_queue_client
-                .update_remote_consistent_lsn(
-                    self.tenant_id,
-                    self.timeline_id,
-                    self.generation,
-                    lsn,
-                    slot,
-                )
-                .await;
        }
-
        self.calls_unfinished_metric_end(&task.op);
    }

@@ -1286,8 +1278,8 @@ impl RemoteTimelineClient {
                    reason: "metadata uploads are tiny",
                },
            ),
-            UploadOp::Delete(_delete) => (
-                RemoteOpFileKind::Layer,
+            UploadOp::Delete(delete) => (
+                delete.file_kind,
                RemoteOpKind::Delete,
                DontTrackSize {
                    reason: "should we track deletes? positive or negative sign?",
@@ -1349,10 +1341,7 @@ impl RemoteTimelineClient {
                        latest_files: initialized.latest_files.clone(),
                        latest_files_changes_since_metadata_upload_scheduled: 0,
                        latest_metadata: initialized.latest_metadata.clone(),
-                        projected_remote_consistent_lsn: None,
-                        visible_remote_consistent_lsn: initialized
-                            .visible_remote_consistent_lsn
-                            .clone(),
+                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
                        num_inprogress_layer_uploads: 0,
                        num_inprogress_metadata_uploads: 0,
                        num_inprogress_deletions: 0,
@@ -1416,13 +1405,13 @@ pub fn remote_layer_path(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    layer_file_name: &LayerFileName,
-    generation: Generation,
+    layer_meta: &LayerFileMetadata,
 ) -> RemotePath {
    // Generation-aware key format
    let path = format!(
        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
        layer_file_name.file_name(),
-        generation.get_suffix()
+        layer_meta.generation.get_suffix()
    );

    RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1565,6 +1554,7 @@ mod tests {

    impl TestSetup {
        async fn new(test_name: &str) -> anyhow::Result<Self> {
+            // Use a current-thread runtime in the test
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = harness.load().await;
@@ -1590,7 +1580,6 @@ mod tests {
                timeline_id: TIMELINE_ID,
                generation,
                storage_impl: self.harness.remote_storage.clone(),
-                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
                metrics: Arc::new(RemoteTimelineClientMetrics::new(
                    &self.harness.tenant_id,
@@ -1760,7 +1749,7 @@ mod tests {
            )
            .unwrap();
        client
-            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
+            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
@@ -1786,7 +1775,6 @@ mod tests {

        // Finish them
        client.wait_completion().await.unwrap();
-        harness.deletion_queue.pump().await;

        assert_remote_files(
            &[
--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ b/pageserver/src/tenant/remote_timeline_client/delete.rs
@@ -0,0 +1,34 @@
+//! Helper functions to delete files from remote storage with a RemoteStorage
+use anyhow::Context;
+use std::path::Path;
+use tracing::debug;
+
+use remote_storage::GenericRemoteStorage;
+
+use crate::{
+    config::PageServerConf,
+    tenant::{remote_timeline_client::remote_path, Generation},
+};
+
+/// Delete one layer file from remote storage. The remote path is built by `remote_path` from
+/// `local_layer_path` and `generation`; a failed delete is returned with that path as context.
+pub(super) async fn delete_layer<'a>(
+    conf: &'static PageServerConf,
+    storage: &'a GenericRemoteStorage,
+    local_layer_path: &'a Path,
+    generation: Generation,
+) -> anyhow::Result<()> {
+    fail::fail_point!("before-delete-layer", |_| {
+        anyhow::bail!("failpoint before-delete-layer")
+    });
+    debug!("Deleting layer from remote storage: {local_layer_path:?}",);
+
+    let path_to_delete = remote_path(conf, local_layer_path, generation)?;
+
+    // Deleting an already-deleted file should not print an error: S3 does not yield one in
+    // that case, and the `LocalFs` wrapper avoids the errors local file system APIs yield.
+    storage
+        .delete(&path_to_delete)
+        .await
+        .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
+}
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -50,12 +50,7 @@ pub async fn download_layer_file<'a>(
        .timeline_path(&tenant_id, &timeline_id)
        .join(layer_file_name.file_name());

-    let remote_path = remote_layer_path(
-        &tenant_id,
-        &timeline_id,
-        layer_file_name,
-        layer_metadata.generation,
-    );
+    let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);

    // Perform a rename inspired by durable_rename from file_utils.c.
    // The sequence:
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -864,11 +864,11 @@ impl DeltaLayerInner {
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
            if actual_summary != expected_summary {
-                // bail!(
-                //     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                //     actual_summary,
-                //     expected_summary
-                // );
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
            }
        }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -457,11 +457,11 @@ impl ImageLayerInner {
            expected_summary.index_root_blk = actual_summary.index_root_blk;

            if actual_summary != expected_summary {
-                // bail!(
-                //     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                //     actual_summary,
-                //     expected_summary
-                // );
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
            }
        }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -38,7 +38,6 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
-use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
@@ -144,7 +143,6 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
    pub remote_client: Option<RemoteTimelineClient>,
-    pub deletion_queue_client: DeletionQueueClient,
 }

 pub struct Timeline {
@@ -523,23 +521,9 @@ impl Timeline {
        self.disk_consistent_lsn.load()
    }

-    /// remote_consistent_lsn from the perspective of the tenant's current generation,
-    /// not validated with control plane yet.
-    /// See [`Self::get_remote_consistent_lsn_visible`].
-    pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+    pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
        if let Some(remote_client) = &self.remote_client {
-            remote_client.remote_consistent_lsn_projected()
-        } else {
-            None
-        }
-    }
-
-    /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
-    /// i.e. a value of remote_consistent_lsn_projected which has undergone
-    /// generation validation in the deletion queue.
-    pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.remote_consistent_lsn_visible()
+            remote_client.last_uploaded_consistent_lsn()
        } else {
            None
        }
@@ -559,7 +543,7 @@ impl Timeline {
    }

    pub fn resident_physical_size(&self) -> u64 {
-        self.metrics.resident_physical_size_get()
+        self.metrics.resident_physical_size_gauge.get()
    }

    ///
@@ -655,38 +639,38 @@ impl Timeline {
    ) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

-        // static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-        //     once_cell::sync::Lazy::new(|| {
-        //         let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
-        //         let permits = usize::max(
-        //             1,
-        //             // while a lot of the work is done on spawn_blocking, we still do
-        //             // repartitioning in the async context. this should give leave us some workers
-        //             // unblocked to be blocked on other work, hopefully easing any outside visible
-        //             // effects of restarts.
-        //             //
-        //             // 6/8 is a guess; previously we ran with unlimited 8 and more from
-        //             // spawn_blocking.
-        //             (total_threads * 3).checked_div(4).unwrap_or(0),
-        //         );
-        //         assert_ne!(permits, 0, "we will not be adding in permits later");
-        //         assert!(
-        //             permits < total_threads,
-        //             "need threads avail for shorter work"
-        //         );
-        //         tokio::sync::Semaphore::new(permits)
-        //     });
+        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
+            once_cell::sync::Lazy::new(|| {
+                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+                let permits = usize::max(
+                    1,
+                    // while a lot of the work is done on spawn_blocking, we still do
+                    // repartitioning in the async context. this should leave us some workers
+                    // unblocked to be blocked on other work, hopefully easing any outside visible
+                    // effects of restarts.
+                    //
+                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
+                    // spawn_blocking.
+                    (total_threads * 3).checked_div(4).unwrap_or(0),
+                );
+                assert_ne!(permits, 0, "we will not be adding in permits later");
+                assert!(
+                    permits < total_threads,
+                    "need threads avail for shorter work"
+                );
+                tokio::sync::Semaphore::new(permits)
+            });

-        // // this wait probably never needs any "long time spent" logging, because we already nag if
-        // // compaction task goes over it's period (20s) which is quite often in production.
-        // let _permit = tokio::select! {
-        //     permit = CONCURRENT_COMPACTIONS.acquire() => {
-        //         permit
-        //     },
-        //     _ = cancel.cancelled() => {
-        //         return Ok(());
-        //     }
-        // };
+        // this wait probably never needs any "long time spent" logging, because we already nag if
+        // compaction task goes over its period (20s) which is quite often in production.
+        let _permit = tokio::select! {
+            permit = CONCURRENT_COMPACTIONS.acquire() => {
+                permit
+            },
+            _ = cancel.cancelled() => {
+                return Ok(());
+            }
+        };

        let last_record_lsn = self.get_last_record_lsn();

@@ -1309,7 +1293,10 @@ impl Timeline {
        // will treat the file as a local layer again, count it towards resident size,
        // and it'll be like the layer removal never happened.
        // The bump in resident size is perhaps unexpected but overall a robust behavior.
-        self.metrics.resident_physical_size_sub(layer_file_size);
+        self.metrics
+            .resident_physical_size_gauge
+            .sub(layer_file_size);
+
        self.metrics.evictions.inc();

        if let Some(delta) = local_layer_residence_duration {
@@ -1833,7 +1820,7 @@ impl Timeline {
            for (layer, m) in needs_upload {
                rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
            }
-            rtc.schedule_layer_file_deletion(needs_cleanup)?;
+            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
            rtc.schedule_index_upload_for_file_changes()?;
            // Tenant::create_timeline will wait for these uploads to happen before returning, or
            // on retry.
@@ -1843,7 +1830,9 @@ impl Timeline {
            "loaded layer map with {} layers at {}, total physical size: {}",
            num_layers, disk_consistent_lsn, total_physical_size
        );
-        self.metrics.resident_physical_size_set(total_physical_size);
+        self.metrics
+            .resident_physical_size_gauge
+            .set(total_physical_size);

        timer.stop_and_record();
        Ok(())
@@ -3886,7 +3875,7 @@ impl Timeline {

        // Also schedule the deletions in remote storage
        if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
+            remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
        }

        Ok(())
@@ -4221,7 +4210,7 @@ impl Timeline {
            }

            if let Some(remote_client) = &self.remote_client {
-                remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
+                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
            }

            apply.flush();
@@ -4393,7 +4382,7 @@ impl Timeline {

                    // XXX the temp file is still around in Err() case
                    // and consumes space until we clean up upon pageserver restart.
-                    self_clone.metrics.resident_physical_size_add(*size);
+                    self_clone.metrics.resident_physical_size_gauge.add(*size);

                    // Download complete. Replace the RemoteLayer with the corresponding
                    // Delta- or ImageLayer in the layer map.
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,7 +14,6 @@ use utils::{

 use crate::{
    config::PageServerConf,
-    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
        metadata::TimelineMetadata,
@@ -408,7 +407,6 @@ impl DeleteTimelineFlow {
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: Option<RemoteTimelineClient>,
-        deletion_queue_client: DeletionQueueClient,
        init_order: Option<&InitializationOrder>,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -418,10 +416,7 @@ impl DeleteTimelineFlow {
                timeline_id,
                local_metadata,
                None, // Ancestor is not needed for deletion.
-                TimelineResources {
-                    remote_client,
-                    deletion_queue_client,
-                },
+                TimelineResources { remote_client },
                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -263,7 +263,7 @@ impl LayerManager {
        let desc = layer.layer_desc();
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_sub(desc.file_size);
+            metrics.resident_physical_size_gauge.sub(desc.file_size);
        }

        // TODO Removing from the bottom of the layer map is expensive.
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -370,9 +370,8 @@ pub(super) async fn handle_walreceiver_connection(
            })?;

        if let Some(last_lsn) = status_update {
-            let timeline_remote_consistent_lsn = timeline
-                .get_remote_consistent_lsn_visible()
-                .unwrap_or(Lsn(0));
+            let timeline_remote_consistent_lsn =
+                timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

            // The last LSN we processed. It is not guaranteed to survive pageserver crash.
            let last_received_lsn = last_lsn;
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,3 +1,5 @@
+use crate::metrics::RemoteOpFileKind;
+
 use super::storage_layer::LayerFileName;
 use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
@@ -9,7 +11,6 @@ use std::fmt::Debug;
 use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;
-use utils::lsn::AtomicLsn;

 use std::sync::atomic::AtomicU32;
 use utils::lsn::Lsn;
@@ -57,12 +58,7 @@ pub(crate) struct UploadQueueInitialized {
    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
    /// Safekeeper can rely on it to make decisions for WAL storage.
-    ///
-    /// visible_remote_consistent_lsn is only updated after our generation has been validated with
-    /// the control plane (unlesss a timeline's generation is None, in which case
-    /// we skip validation)
-    pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
-    pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
+    pub(crate) last_uploaded_consistent_lsn: Lsn,

    // Breakdown of different kinds of tasks currently in-progress
    pub(crate) num_inprogress_layer_uploads: usize,
@@ -85,14 +81,6 @@ impl UploadQueueInitialized {
    pub(super) fn no_pending_work(&self) -> bool {
        self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
    }
-
-    pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
-        self.visible_remote_consistent_lsn.load()
-    }
-
-    pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
-        self.projected_remote_consistent_lsn
-    }
 }

 #[derive(Clone, Copy)]
@@ -126,8 +114,9 @@ impl UploadQueue {
            latest_files: HashMap::new(),
            latest_files_changes_since_metadata_upload_scheduled: 0,
            latest_metadata: metadata.clone(),
-            projected_remote_consistent_lsn: None,
-            visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
+            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
+            // safekeepers from garbage-collecting anything.
+            last_uploaded_consistent_lsn: Lsn(0),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
@@ -169,10 +158,7 @@ impl UploadQueue {
            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
            latest_metadata: index_part.metadata.clone(),
-            projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
-            visible_remote_consistent_lsn: Arc::new(
-                index_part.metadata.disk_consistent_lsn().into(),
-            ),
+            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
@@ -215,11 +201,12 @@ pub(crate) struct UploadTask {
    pub(crate) op: UploadOp,
 }

-/// A deletion of some layers within the lifetime of a timeline.  This is not used
-/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug)]
 pub(crate) struct Delete {
-    pub(crate) layers: Vec<(LayerFileName, Generation)>,
+    pub(crate) file_kind: RemoteOpFileKind,
+    pub(crate) layer_file_name: LayerFileName,
+    pub(crate) scheduled_from_timeline_delete: bool,
+    pub(crate) generation: Generation,
 }

 #[derive(Debug)]
@@ -230,7 +217,7 @@ pub(crate) enum UploadOp {
    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),

-    /// Delete layer files
+    /// Delete a layer file
    Delete(Delete),

    /// Barrier. When the barrier operation is reached,
@@ -252,9 +239,13 @@ impl std::fmt::Display for UploadOp {
            UploadOp::UploadMetadata(_, lsn) => {
                write!(f, "UploadMetadata(lsn: {})", lsn)
            }
-            UploadOp::Delete(delete) => {
-                write!(f, "Delete({} layers)", delete.layers.len(),)
-            }
+            UploadOp::Delete(delete) => write!(
+                f,
+                "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
+                delete.layer_file_name.file_name(),
+                delete.scheduled_from_timeline_delete,
+                delete.generation
+            ),
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
    }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -18,8 +18,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-use tokio::time::Instant;
+use std::sync::{RwLock, RwLockWriteGuard};

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -111,7 +110,7 @@ impl OpenFiles {
    ///
    /// On return, we hold a lock on the slot, and its 'tag' has been updated
    /// recently_used has been set. It's all ready for reuse.
-    async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
+    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
        //
        // Run the clock algorithm to find a slot to replace.
        //
@@ -143,7 +142,7 @@ impl OpenFiles {
                }
                retries += 1;
            } else {
-                slot_guard = slot.inner.write().await;
+                slot_guard = slot.inner.write().unwrap();
                index = next;
                break;
            }
@@ -154,7 +153,7 @@ impl OpenFiles {
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
-            // the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
+            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
            STORAGE_IO_TIME_METRIC
                .get(StorageIoOperation::CloseByReplace)
@@ -209,29 +208,6 @@ impl CrashsafeOverwriteError {
    }
 }

-/// Observe duration for the given storage I/O operation
-///
-/// Unlike `observe_closure_duration`, this supports async,
-/// where "support" means that we measure wall clock time.
-macro_rules! observe_duration {
-    ($op:expr, $($body:tt)*) => {{
-        let instant = Instant::now();
-        let result = $($body)*;
-        let elapsed = instant.elapsed().as_secs_f64();
-        STORAGE_IO_TIME_METRIC
-            .get($op)
-            .observe(elapsed);
-        result
-    }}
-}
-
-macro_rules! with_file {
-    ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
-        let $ident = $this.lock_file().await?;
-        observe_duration!($op, $($body)*)
-    }};
-}
-
 impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
@@ -268,9 +244,11 @@ impl VirtualFile {
            tenant_id = "*".to_string();
            timeline_id = "*".to_string();
        }
-        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
+        let (handle, mut slot_guard) = get_open_files().find_victim_slot();

-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
+            .observe_closure_duration(|| open_options.open(path))?;

        // Strip all options other than read and write.
        //
@@ -353,24 +331,22 @@ impl VirtualFile {

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
+            .await?
    }

    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+        self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
+            .await?
    }

-    /// Helper function internal to `VirtualFile` that looks up the underlying File,
-    /// opens it and evicts some other File if necessary. The passed parameter is
-    /// assumed to be a function available for the physical `File`.
-    ///
-    /// We are doing it via a macro as Rust doesn't support async closures that
-    /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    /// Helper function that looks up the underlying File for this VirtualFile,
+    /// opening it and evicting some other File if necessary. It calls 'func'
+    /// with the physical File.
+    async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
+    where
+        F: FnMut(&File) -> R,
+    {
        let open_files = get_open_files();

        let mut handle_guard = {
@@ -380,23 +356,27 @@ impl VirtualFile {
            // We only need to hold the handle lock while we read the current handle. If
            // another thread closes the file and recycles the slot for a different file,
            // we will notice that the handle we read is no longer valid and retry.
-            let mut handle = *self.handle.read().await;
+            let mut handle = *self.handle.read().unwrap();
            loop {
                // Check if the slot contains our File
                {
                    let slot = &open_files.slots[handle.index];
-                    let slot_guard = slot.inner.read().await;
-                    if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
-                        // Found a cached file descriptor.
-                        slot.recently_used.store(true, Ordering::Relaxed);
-                        return Ok(FileGuard { slot_guard });
+                    let slot_guard = slot.inner.read().unwrap();
+                    if slot_guard.tag == handle.tag {
+                        if let Some(file) = &slot_guard.file {
+                            // Found a cached file descriptor.
+                            slot.recently_used.store(true, Ordering::Relaxed);
+                            return Ok(STORAGE_IO_TIME_METRIC
+                                .get(op)
+                                .observe_closure_duration(|| func(file)));
+                        }
                    }
                }

                // The slot didn't contain our File. We will have to open it ourselves,
                // but before that, grab a write lock on handle in the VirtualFile, so
                // that no other thread will try to concurrently open the same file.
-                let handle_guard = self.handle.write().await;
+                let handle_guard = self.handle.write().unwrap();

                // If another thread changed the handle while we were not holding the lock,
                // then the handle might now be valid again. Loop back to retry.
@@ -410,10 +390,17 @@ impl VirtualFile {

        // We need to open the file ourselves. The handle in the VirtualFile is
        // now locked in write-mode. Find a free slot to put it in.
-        let (handle, mut slot_guard) = open_files.find_victim_slot().await;
+        let (handle, mut slot_guard) = open_files.find_victim_slot();

        // Open the physical file
-        let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
+            .observe_closure_duration(|| self.open_options.open(&self.path))?;
+
+        // Perform the requested operation on it
+        let result = STORAGE_IO_TIME_METRIC
+            .get(op)
+            .observe_closure_duration(|| func(&file));

        // Store the File in the slot and update the handle in the VirtualFile
        // to point to it.
@@ -421,9 +408,7 @@ impl VirtualFile {

        *handle_guard = handle;

-        return Ok(FileGuard {
-            slot_guard: slot_guard.downgrade(),
-        });
+        Ok(result)
    }

    pub fn remove(self) {
@@ -438,9 +423,11 @@ impl VirtualFile {
                self.pos = offset;
            }
            SeekFrom::End(offset) => {
-                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
-                    .as_ref()
-                    .seek(SeekFrom::End(offset)))?
+                self.pos = self
+                    .with_file(StorageIoOperation::Seek, |mut file| {
+                        file.seek(SeekFrom::End(offset))
+                    })
+                    .await??
            }
            SeekFrom::Current(offset) => {
                let pos = self.pos as i128 + offset as i128;
@@ -528,9 +515,9 @@ impl VirtualFile {
    }

    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Read, |file| file
-            .as_ref()
-            .read_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -540,9 +527,9 @@ impl VirtualFile {
    }

    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file| file
-            .as_ref()
-            .write_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -552,18 +539,6 @@ impl VirtualFile {
    }
 }

-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
-}
-
-impl<'a> AsRef<File> for FileGuard<'a> {
-    fn as_ref(&self) -> &File {
-        // This unwrap is safe because we only create `FileGuard`s
-        // if we know that the file is Some.
-        self.slot_guard.file.as_ref().unwrap()
-    }
-}
-
 #[cfg(test)]
 impl VirtualFile {
    pub(crate) async fn read_blk(
@@ -596,39 +571,20 @@ impl VirtualFile {
 impl Drop for VirtualFile {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
-        let handle = self.handle.get_mut();
+        let handle = self.handle.get_mut().unwrap();

-        fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
-            if slot_guard.tag == tag {
-                slot.recently_used.store(false, Ordering::Relaxed);
-                // there is also the `CloseByReplace` operation for closes done on eviction for
-                // comparison.
-                STORAGE_IO_TIME_METRIC
-                    .get(StorageIoOperation::Close)
-                    .observe_closure_duration(|| drop(slot_guard.file.take()));
-            }
-        }
-
-        // We don't have async drop so we cannot directly await the lock here.
-        // Instead, first do a best-effort attempt at closing the underlying
-        // file descriptor by using `try_write`, and if that fails, spawn
-        // a tokio task to do it asynchronously: we just want it to be
-        // cleaned up eventually.
-        // Most of the time, the `try_lock` should succeed though,
-        // as we have `&mut self` access. In other words, if the slot
-        // is still occupied by our file, there should be no access from
-        // other I/O operations; the only other possible place to lock
-        // the slot is the lock algorithm looking for free slots.
+        // We could check with a read-lock first, to avoid waiting on an
+        // unrelated I/O.
        let slot = &get_open_files().slots[handle.index];
-        if let Ok(slot_guard) = slot.inner.try_write() {
-            clean_slot(slot, slot_guard, handle.tag);
-        } else {
-            let tag = handle.tag;
-            tokio::spawn(async move {
-                let slot_guard = slot.inner.write().await;
-                clean_slot(slot, slot_guard, tag);
-            });
-        };
+        let mut slot_guard = slot.inner.write().unwrap();
+        if slot_guard.tag == handle.tag {
+            slot.recently_used.store(false, Ordering::Relaxed);
+            // there is also operation "close-by-replace" for closes done on eviction for
+            // comparison.
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::Close)
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
+        }
    }
 }

--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -741,13 +741,6 @@ NeonProcessUtility(
 			break;
 		case T_DropdbStmt:
 			HandleDropDb(castNode(DropdbStmt, parseTree));
-			/*
-			 * We do this here to hack around the fact that Postgres performs the drop
-			 * INSIDE of standard_ProcessUtility, which means that if we try to
-			 * abort the drop normally it'll be too late. DROP DATABASE can't be inside
-			 * of a transaction block anyway, so this should be fine to do.
-			 */
-			NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
 			break;
 		case T_CreateRoleStmt:
 			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -14,6 +14,7 @@
 */

 #include <sys/file.h>
+#include <sys/statvfs.h>
 #include <unistd.h>
 #include <fcntl.h>

@@ -37,6 +38,9 @@
 #include "storage/fd.h"
 #include "storage/pg_shmem.h"
 #include "storage/buf_internals.h"
+#include "storage/procsignal.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"

 /*
 * Local file cache is used to temporary store relations pages in local file system.
@@ -62,6 +66,9 @@

 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))

+#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
+#define MAX_DISK_WRITE_RATE       1000 /* MB/sec */
+
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
@@ -84,12 +91,14 @@ static int   lfc_desc = 0;
 static LWLockId lfc_lock;
 static int   lfc_max_size;
 static int   lfc_size_limit;
+static int   lfc_free_space_watermark;
 static char* lfc_path;
 static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif
+static int   lfc_shrinking_factor; /* shift count (power-of-two exponent) by which the local cache size is shrunk when lfc_free_space_watermark is reached */

 void FileCacheMonitorMain(Datum main_arg);

@@ -245,6 +254,80 @@ lfc_change_limit_hook(int newval, void *extra)
 	LWLockRelease(lfc_lock);
 }

+/*
+ * The local file system state monitor checks the available free space.
+ * If it is lower than lfc_free_space_watermark then we shrink the local cache
+ * by throwing away the least recently accessed chunks.
+ * First time low space watermark is reached cache size is divided by two,
+ * second time by four,... Finally we remove all chunks from local cache.
+ *
+ * Please note that we are not changing lfc_cache_size: it is meant to be adjusted by the autoscaler.
+ * We only throw away cached chunks; we do not prevent the cache from being refilled with new chunks.
+ *
+ * The polling interval is calculated as the minimal time needed to consume lfc_free_space_watermark
+ * of disk space at the maximal possible disk write speed (MAX_DISK_WRITE_RATE, 1000 MB/sec), but no longer than 1 second.
+ * Calling statvfs once per second should not add any noticeable overhead.
+ */
+void
+FileCacheMonitorMain(Datum main_arg)
+{
+	/*
+	 * Choose the file system state monitor interval so that free space cannot be exhausted
+	 * during this period, but no longer than MAX_MONITOR_INTERVAL_USEC (1 second).
+	 */
+	uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
+
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
+	BackgroundWorkerUnblockSignals();
+
+	/* Periodically check free disk space until terminated. */
+	while (!ShutdownRequestPending)
+	{
+		if (lfc_size_limit != 0)
+		{
+			struct statvfs sfs;
+			if (statvfs(lfc_path, &sfs) < 0)
+			{
+				elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
+			}
+			else
+			{
+				if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
+				{
+					if (lfc_shrinking_factor < 31) {
+						lfc_shrinking_factor += 1;
+					}
+					lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
+				}
+				else
+					lfc_shrinking_factor = 0; /* reset to initial value */
+			}
+		}
+		pg_usleep(monitor_interval);
+	}
+}
+
+static void
+lfc_register_free_space_monitor(void)
+{
+	BackgroundWorker bgw;
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
 void
 lfc_init(void)
 {
@@ -281,6 +364,19 @@ lfc_init(void)
 							lfc_change_limit_hook,
 							NULL);

+	DefineCustomIntVariable("neon.free_space_watermark",
+							"Minimal free space in local file system after reaching which local file cache will be truncated",
+							NULL,
+							&lfc_free_space_watermark,
+							1024, /* 1GB */
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MB,
+							NULL,
+							NULL,
+							NULL);
+
 	DefineCustomStringVariable("neon.file_cache_path",
 							   "Path to local file cache (can be raw device)",
 							   NULL,
@@ -295,6 +391,9 @@ lfc_init(void)
 	if (lfc_max_size == 0)
 		return;

+	if (lfc_free_space_watermark != 0)
+		lfc_register_free_space_monitor();
+
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = lfc_shmem_startup;
 #if PG_VERSION_NUM>=150000
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1790,14 +1790,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if (!XLogInsertAllowed())
 		return;

-	/* ensure we have enough xlog buffers to log max-sized records */
-	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
-
-	/*
-	 * Iterate over all the pages. They are collected into batches of
-	 * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each
-	 * batch.
-	 */
 	while (remblocks > 0)
 	{
 		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -42,7 +42,6 @@ reqwest-middleware.workspace = true
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
 routerify.workspace = true
-rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
 scopeguard.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -160,19 +160,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Test(_) => Some("test".to_owned()),
        }
    }
-
-    /// Get username from the credentials.
-    pub fn get_user(&self) -> &str {
-        use BackendType::*;
-
-        match self {
-            Console(_, creds) => creds.user,
-            Postgres(_, creds) => creds.user,
-            Link(_) => "link",
-            Test(_) => "test",
-        }
-    }
-
    /// Authenticate the client via the requested backend, possibly using credentials.
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -17,12 +17,11 @@ use std::{
 use tokio::time;
 use tokio_postgres::AsyncMessage;

-use crate::{
-    auth, console,
-    metrics::{Ids, MetricCounter, USAGE_METRICS},
-};
+use crate::{auth, console};
 use crate::{compute, config};

+use super::sql_over_http::MAX_RESPONSE_SIZE;
+
 use crate::proxy::ConnectMechanism;

 use tracing::{error, warn};
@@ -401,6 +400,7 @@ async fn connect_to_compute_once(
        .user(&conn_info.username)
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
+        .max_backend_message_size(MAX_RESPONSE_SIZE)
        .connect_timeout(timeout)
        .connect(tokio_postgres::NoTls)
        .await?;
@@ -412,10 +412,6 @@ async fn connect_to_compute_once(
    span.in_scope(|| {
        info!(%conn_info, %session, "new connection");
    });
-    let ids = Ids {
-        endpoint_id: node_info.aux.endpoint_id.to_string(),
-        branch_id: node_info.aux.branch_id.to_string(),
-    };

    tokio::spawn(
        poll_fn(move |cx| {
@@ -454,18 +450,10 @@ async fn connect_to_compute_once(
    Ok(Client {
        inner: client,
        session: tx,
-        ids,
    })
 }

 pub struct Client {
    pub inner: tokio_postgres::Client,
    session: tokio::sync::watch::Sender<uuid::Uuid>,
-    ids: Ids,
-}
-
-impl Client {
-    pub fn metrics(&self) -> Arc<MetricCounter> {
-        USAGE_METRICS.register(self.ids.clone())
-    }
 }
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -3,12 +3,10 @@ use std::sync::Arc;
 use anyhow::bail;
 use futures::pin_mut;
 use futures::StreamExt;
+use hashbrown::HashMap;
 use hyper::body::HttpBody;
-use hyper::header;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
-use hyper::Response;
-use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
@@ -18,11 +16,7 @@ use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
-use tracing::error;
-use tracing::instrument;
 use url::Url;
-use utils::http::error::ApiError;
-use utils::http::json::json_response;

 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
@@ -45,6 +39,7 @@ enum Payload {
    Batch(BatchQueryData),
 }

+pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
@@ -187,45 +182,7 @@ pub async fn handle(
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
-) -> Result<Response<Body>, ApiError> {
-    let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
-
-    let mut response = match result {
-        Ok(r) => r,
-        Err(e) => {
-            let message = format!("{:?}", e);
-            let code = match e.downcast_ref::<tokio_postgres::Error>() {
-                Some(e) => match e.code() {
-                    Some(e) => serde_json::to_value(e.code()).unwrap(),
-                    None => Value::Null,
-                },
-                None => Value::Null,
-            };
-            error!(
-                ?code,
-                "sql-over-http per-client task finished with an error: {e:#}"
-            );
-            // TODO: this shouldn't always be bad request.
-            json_response(
-                StatusCode::BAD_REQUEST,
-                json!({ "message": message, "code": code }),
-            )?
-        }
-    };
-    response.headers_mut().insert(
-        "Access-Control-Allow-Origin",
-        hyper::http::HeaderValue::from_static("*"),
-    );
-    Ok(response)
-}
-
-#[instrument(name = "sql-over-http", skip_all)]
-async fn handle_inner(
-    request: Request<Body>,
-    sni_hostname: Option<String>,
-    conn_pool: Arc<GlobalConnPool>,
-    session_id: uuid::Uuid,
-) -> anyhow::Result<Response<Body>> {
+) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
    //
    // Determine the destination and connection params
    //
@@ -276,18 +233,13 @@ async fn handle_inner(

    let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;

-    let mut response = Response::builder()
-        .status(StatusCode::OK)
-        .header(header::CONTENT_TYPE, "application/json");
-
    //
    // Now execute the query and return the result
    //
-    let mut size = 0;
    let result = match payload {
-        Payload::Single(query) => {
-            query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await
-        }
+        Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
+            .await
+            .map(|x| (x, HashMap::default())),
        Payload::Batch(batch_query) => {
            let mut results = Vec::new();
            let mut builder = client.inner.build_transaction();
@@ -302,8 +254,7 @@ async fn handle_inner(
            }
            let transaction = builder.start().await?;
            for query in batch_query.queries {
-                let result =
-                    query_to_json(&transaction, query, &mut size, raw_output, array_mode).await;
+                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
                match result {
                    Ok(r) => results.push(r),
                    Err(e) => {
@@ -313,27 +264,26 @@ async fn handle_inner(
                }
            }
            transaction.commit().await?;
+            let mut headers = HashMap::default();
            if txn_read_only {
-                response = response.header(
+                headers.insert(
                    TXN_READ_ONLY.clone(),
                    HeaderValue::try_from(txn_read_only.to_string())?,
                );
            }
            if txn_deferrable {
-                response = response.header(
+                headers.insert(
                    TXN_DEFERRABLE.clone(),
                    HeaderValue::try_from(txn_deferrable.to_string())?,
                );
            }
            if let Some(txn_isolation_level) = txn_isolation_level_raw {
-                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
            }
-            Ok(json!({ "results": results }))
+            Ok((json!({ "results": results }), headers))
        }
    };

-    let metrics = client.metrics();
-
    if allow_pool {
        let current_span = tracing::Span::current();
        // return connection to the pool
@@ -343,30 +293,12 @@ async fn handle_inner(
        });
    }

-    match result {
-        Ok(value) => {
-            // how could this possibly fail
-            let body = serde_json::to_string(&value).expect("json serialization should not fail");
-            let len = body.len();
-            let response = response
-                .body(Body::from(body))
-                // only fails if invalid status code or invalid header/values are given.
-                // these are not user configurable so it cannot fail dynamically
-                .expect("building response payload should not fail");
-
-            // count the egress bytes - we miss the TLS and header overhead but oh well...
-            // moving this later in the stack is going to be a lot of effort and ehhhh
-            metrics.record_egress(len as u64);
-            Ok(response)
-        }
-        Err(e) => Err(e),
-    }
+    result
 }

 async fn query_to_json<T: GenericClient>(
    client: &T,
    data: QueryData,
-    current_size: &mut usize,
    raw_output: bool,
    array_mode: bool,
 ) -> anyhow::Result<Value> {
@@ -380,10 +312,16 @@ async fn query_to_json<T: GenericClient>(
    // big.
    pin_mut!(row_stream);
    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
+    let mut current_size = 0;
    while let Some(row) = row_stream.next().await {
        let row = row?;
-        *current_size += row.body_len();
+        current_size += row.body_len();
        rows.push(row);
+        if current_size > MAX_RESPONSE_SIZE {
+            return Err(anyhow::anyhow!(
+                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
+            ));
+        }
    }

    // grab the command tag and number of rows affected
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -7,6 +7,7 @@ use crate::{
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
+use hashbrown::HashMap;
 use hyper::{
    server::{
        accept,
@@ -17,6 +18,7 @@ use hyper::{
 };
 use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
 use pin_project_lite::pin_project;
+use serde_json::{json, Value};

 use std::{
    convert::Infallible,
@@ -202,7 +204,44 @@ async fn ws_handler(
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await
+        let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
+            .instrument(info_span!("sql-over-http"))
+            .await;
+        let status_code = match result {
+            Ok(_) => StatusCode::OK,
+            Err(_) => StatusCode::BAD_REQUEST,
+        };
+        let (json, headers) = match result {
+            Ok(r) => r,
+            Err(e) => {
+                let message = format!("{:?}", e);
+                let code = match e.downcast_ref::<tokio_postgres::Error>() {
+                    Some(e) => match e.code() {
+                        Some(e) => serde_json::to_value(e.code()).unwrap(),
+                        None => Value::Null,
+                    },
+                    None => Value::Null,
+                };
+                error!(
+                    ?code,
+                    "sql-over-http per-client task finished with an error: {e:#}"
+                );
+                (
+                    json!({ "message": message, "code": code }),
+                    HashMap::default(),
+                )
+            }
+        };
+        json_response(status_code, json).map(|mut r| {
+            r.headers_mut().insert(
+                "Access-Control-Allow-Origin",
+                hyper::http::HeaderValue::from_static("*"),
+            );
+            for (k, v) in headers {
+                r.headers_mut().insert(k, v);
+            }
+            r
+        })
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
        Response::builder()
            .header("Allow", "OPTIONS, POST")
@@ -214,7 +253,7 @@ async fn ws_handler(
            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
            .body(Body::empty())
-            .map_err(|e| ApiError::InternalServerError(e.into()))
+            .map_err(|e| ApiError::BadRequest(e.into()))
    } else {
        json_response(StatusCode::BAD_REQUEST, "query is not supported")
    }
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -3,18 +3,9 @@
 use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
-use dashmap::{mapref::entry::Entry, DashMap};
-use once_cell::sync::Lazy;
-use serde::{Deserialize, Serialize};
-use std::{
-    convert::Infallible,
-    sync::{
-        atomic::{AtomicU64, AtomicUsize, Ordering},
-        Arc,
-    },
-    time::Duration,
-};
-use tracing::{error, info, instrument, trace};
+use serde::Serialize;
+use std::{collections::HashMap, convert::Infallible, time::Duration};
+use tracing::{error, info, instrument, trace, warn};

 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

@@ -27,95 +18,12 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// Both the proxy and the ingestion endpoint will live in the same region (or cell)
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
-#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
+#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
 pub struct Ids {
    pub endpoint_id: String,
    pub branch_id: String,
 }

-#[derive(Debug)]
-pub struct MetricCounter {
-    transmitted: AtomicU64,
-    opened_connections: AtomicUsize,
-}
-
-impl MetricCounter {
-    /// Record that some bytes were sent from the proxy to the client
-    pub fn record_egress(&self, bytes: u64) {
-        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
-    }
-
-    /// extract the value that should be reported
-    fn should_report(self: &Arc<Self>) -> Option<u64> {
-        // heuristic to see if the branch is still open
-        // if a clone happens while we are observing, the heuristic will be incorrect.
-        //
-        // Worst case is that we won't report an event for this endpoint.
-        // However, for the strong count to be 1 it must have occured that at one instant
-        // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
-        let is_open = Arc::strong_count(self) > 1;
-        let opened = self.opened_connections.swap(0, Ordering::AcqRel);
-
-        // update cached metrics eagerly, even if they can't get sent
-        // (to avoid sending the same metrics twice)
-        // see the relevant discussion on why to do so even if the status is not success:
-        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
-        let value = self.transmitted.swap(0, Ordering::AcqRel);
-
-        // Our only requirement is that we report in every interval if there was an open connection
-        // if there were no opened connections since, then we don't need to report
-        if value == 0 && !is_open && opened == 0 {
-            None
-        } else {
-            Some(value)
-        }
-    }
-
-    /// Determine whether the counter should be cleared from the global map.
-    fn should_clear(self: &mut Arc<Self>) -> bool {
-        // we can't clear this entry if it's acquired elsewhere
-        let Some(counter) = Arc::get_mut(self) else {
-            return false;
-        };
-        let opened = *counter.opened_connections.get_mut();
-        let value = *counter.transmitted.get_mut();
-        // clear if there's no data to report
-        value == 0 && opened == 0
-    }
-}
-
-// endpoint and branch IDs are not user generated so we don't run the risk of hash-dos
-type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
-
-#[derive(Default)]
-pub struct Metrics {
-    endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
-}
-
-impl Metrics {
-    /// Register a new byte metrics counter for this endpoint
-    pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
-        let entry = if let Some(entry) = self.endpoints.get(&ids) {
-            entry.clone()
-        } else {
-            self.endpoints
-                .entry(ids)
-                .or_insert_with(|| {
-                    Arc::new(MetricCounter {
-                        transmitted: AtomicU64::new(0),
-                        opened_connections: AtomicUsize::new(0),
-                    })
-                })
-                .clone()
-        };
-
-        entry.opened_connections.fetch_add(1, Ordering::AcqRel);
-        entry
-    }
-}
-
-pub static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
-
 pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
    info!("metrics collector config: {config:?}");
    scopeguard::defer! {
@@ -123,83 +31,145 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
    }

    let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
+    let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();

-    let mut prev = Utc::now();
    let mut ticker = tokio::time::interval(config.interval);
    loop {
        ticker.tick().await;

-        let now = Utc::now();
-        collect_metrics_iteration(
-            &USAGE_METRICS,
+        let res = collect_metrics_iteration(
            &http_client,
+            &mut cached_metrics,
            &config.endpoint,
            &hostname,
-            prev,
-            now,
        )
        .await;
-        prev = now;
+
+        match res {
+            Err(e) => error!("failed to send consumption metrics: {e} "),
+            Ok(_) => trace!("periodic metrics collection completed successfully"),
+        }
    }
 }

+fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
+    let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
+    let metrics = prometheus::default_registry().gather();
+
+    for m in metrics {
+        if m.get_name() == "proxy_io_bytes_per_client" {
+            for ms in m.get_metric() {
+                let direction = ms
+                    .get_label()
+                    .iter()
+                    .find(|l| l.get_name() == "direction")
+                    .unwrap()
+                    .get_value();
+
+                // Only collect metric for outbound traffic
+                if direction == "tx" {
+                    let endpoint_id = ms
+                        .get_label()
+                        .iter()
+                        .find(|l| l.get_name() == "endpoint_id")
+                        .unwrap()
+                        .get_value();
+                    let branch_id = ms
+                        .get_label()
+                        .iter()
+                        .find(|l| l.get_name() == "branch_id")
+                        .unwrap()
+                        .get_value();
+
+                    let value = ms.get_counter().get_value() as u64;
+
+                    // Report if the metric value is suspiciously large
+                    if value > (1u64 << 40) {
+                        warn!(
+                            "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
+                            branch_id, endpoint_id, value
+                        );
+                    }
+
+                    current_metrics.push((
+                        Ids {
+                            endpoint_id: endpoint_id.to_string(),
+                            branch_id: branch_id.to_string(),
+                        },
+                        (value, Utc::now()),
+                    ));
+                }
+            }
+        }
+    }
+
+    current_metrics
+}
+
 #[instrument(skip_all)]
 async fn collect_metrics_iteration(
-    metrics: &Metrics,
    client: &http::ClientWithMiddleware,
+    cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
    metric_collection_endpoint: &reqwest::Url,
    hostname: &str,
-    prev: DateTime<Utc>,
-    now: DateTime<Utc>,
-) {
+) -> anyhow::Result<()> {
    info!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
    );

-    let mut metrics_to_clear = Vec::new();
+    let current_metrics = gather_proxy_io_bytes_per_client();

-    let metrics_to_send: Vec<(Ids, u64)> = metrics
-        .endpoints
+    let metrics_to_send: Vec<Event<Ids, &'static str>> = current_metrics
        .iter()
-        .filter_map(|counter| {
-            let key = counter.key().clone();
-            let Some(value) = counter.should_report() else {
-                metrics_to_clear.push(key);
-                return None;
+        .filter_map(|(curr_key, (curr_val, curr_time))| {
+            let mut start_time = *curr_time;
+            let mut value = *curr_val;
+
+            if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
+                // Only send metrics updates if the metric has increased
+                if curr_val > prev_val {
+                    value = curr_val - prev_val;
+                    start_time = *prev_time;
+                } else {
+                    if curr_val < prev_val {
+                        error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
+                        prev_val, curr_val, curr_key);
+                    }
+                    return None;
+                }
            };
-            Some((key, value))
+
+            Some(Event {
+                kind: EventType::Incremental {
+                    start_time,
+                    stop_time: *curr_time,
+                },
+                metric: PROXY_IO_BYTES_PER_CLIENT,
+                idempotency_key: idempotency_key(hostname),
+                value,
+                extra: Ids {
+                    endpoint_id: curr_key.endpoint_id.clone(),
+                    branch_id: curr_key.branch_id.clone(),
+                },
+            })
        })
        .collect();

    if metrics_to_send.is_empty() {
        trace!("no new metrics to send");
+        return Ok(());
    }

    // Send metrics.
    // Split into chunks of 1000 metrics to avoid exceeding the max request size
    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
-        let events = chunk
-            .iter()
-            .map(|(ids, value)| Event {
-                kind: EventType::Incremental {
-                    start_time: prev,
-                    stop_time: now,
-                },
-                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname),
-                value: *value,
-                extra: Ids {
-                    endpoint_id: ids.endpoint_id.clone(),
-                    branch_id: ids.branch_id.clone(),
-                },
-            })
-            .collect();
-
        let res = client
            .post(metric_collection_endpoint.clone())
-            .json(&EventChunk { events })
+            .json(&EventChunk {
+                events: chunk.into(),
+            })
            .send()
            .await;

@@ -213,113 +183,34 @@ async fn collect_metrics_iteration(

        if !res.status().is_success() {
            error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
+            for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
                // Report if the metric value is suspiciously large
                error!("potentially abnormal metric value: {:?}", metric);
            }
        }
-    }
+        // update cached metrics after they were sent
+        // (to avoid sending the same metrics twice)
+        // see the relevant discussion on why to do so even if the status is not success:
+        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
+        for send_metric in chunk {
+            let stop_time = match send_metric.kind {
+                EventType::Incremental { stop_time, .. } => stop_time,
+                _ => unreachable!(),
+            };

-    for metric in metrics_to_clear {
-        match metrics.endpoints.entry(metric) {
-            Entry::Occupied(mut counter) => {
-                if counter.get_mut().should_clear() {
-                    counter.remove_entry();
-                }
-            }
-            Entry::Vacant(_) => {}
+            cached_metrics
+                .entry(Ids {
+                    endpoint_id: send_metric.extra.endpoint_id.clone(),
+                    branch_id: send_metric.extra.branch_id.clone(),
+                })
+                // update cached value (add delta) and time
+                .and_modify(|e| {
+                    e.0 = e.0.saturating_add(send_metric.value);
+                    e.1 = stop_time
+                })
+                // cache new metric
+                .or_insert((send_metric.value, stop_time));
        }
    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::{
-        net::TcpListener,
-        sync::{Arc, Mutex},
-    };
-
-    use anyhow::Error;
-    use chrono::Utc;
-    use consumption_metrics::{Event, EventChunk};
-    use hyper::{
-        service::{make_service_fn, service_fn},
-        Body, Response,
-    };
-    use url::Url;
-
-    use super::{collect_metrics_iteration, Ids, Metrics};
-    use crate::http;
-
-    #[tokio::test]
-    async fn metrics() {
-        let listener = TcpListener::bind("0.0.0.0:0").unwrap();
-
-        let reports = Arc::new(Mutex::new(vec![]));
-        let reports2 = reports.clone();
-
-        let server = hyper::server::Server::from_tcp(listener)
-            .unwrap()
-            .serve(make_service_fn(move |_| {
-                let reports = reports.clone();
-                async move {
-                    Ok::<_, Error>(service_fn(move |req| {
-                        let reports = reports.clone();
-                        async move {
-                            let bytes = hyper::body::to_bytes(req.into_body()).await?;
-                            let events: EventChunk<'static, Event<Ids, String>> =
-                                serde_json::from_slice(&bytes)?;
-                            reports.lock().unwrap().push(events);
-                            Ok::<_, Error>(Response::new(Body::from(vec![])))
-                        }
-                    }))
-                }
-            }));
-        let addr = server.local_addr();
-        tokio::spawn(server);
-
-        let metrics = Metrics::default();
-        let client = http::new_client();
-        let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
-        let now = Utc::now();
-
-        // no counters have been registered
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
-        assert!(r.is_empty());
-
-        // register a new counter
-        let counter = metrics.register(Ids {
-            endpoint_id: "e1".to_string(),
-            branch_id: "b1".to_string(),
-        });
-
-        // the counter should be observed despite 0 egress
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
-        assert_eq!(r.len(), 1);
-        assert_eq!(r[0].events.len(), 1);
-        assert_eq!(r[0].events[0].value, 0);
-
-        // record egress
-        counter.record_egress(1);
-
-        // egress should be observered
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
-        assert_eq!(r.len(), 1);
-        assert_eq!(r[0].events.len(), 1);
-        assert_eq!(r[0].events[0].value, 1);
-
-        // release counter
-        drop(counter);
-
-        // we do not observe the counter
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
-        assert!(r.is_empty());
-
-        // counter is unregistered
-        assert!(metrics.endpoints.is_empty());
-    }
+    Ok(())
 }
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -7,7 +7,6 @@ use crate::{
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
-    metrics::{Ids, USAGE_METRICS},
    protocol2::WithClientIp,
    stream::{PqStream, Stream},
 };
@@ -603,11 +602,6 @@ pub async fn proxy_pass(
    compute: impl AsyncRead + AsyncWrite + Unpin,
    aux: &MetricsAuxInfo,
 ) -> anyhow::Result<()> {
-    let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.to_string(),
-        branch_id: aux.branch_id.to_string(),
-    });
-
    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
    let mut client = MeasuredStream::new(
        client,
@@ -615,7 +609,6 @@ pub async fn proxy_pass(
        |cnt| {
            // Number of bytes we sent to the client (outbound).
            m_sent.inc_by(cnt as u64);
-            usage.record_egress(cnt as u64);
        },
    );

@@ -697,14 +690,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            .await
        {
            Ok(auth_result) => auth_result,
-            Err(e) => {
-                let user = creds.get_user();
-                let db = params.get("database");
-                let app = params.get("application_name");
-                let params_span = tracing::info_span!("", ?user, ?db, ?app);
-
-                return stream.throw_error(e).instrument(params_span).await;
-            }
+            Err(e) => return stream.throw_error(e).await,
        };

        let AuthSuccess {
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -105,8 +105,6 @@ class NeonCompare(PgCompare):
        self._pg_bin = pg_bin
        self.pageserver_http_client = self.env.pageserver.http_client()

-        # note that neon_simple_env now uses LOCAL_FS remote storage
-
        # Create tenant
        tenant_conf: Dict[str, str] = {}
        if False:  # TODO add pytest setting for this
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -460,11 +460,9 @@ class NeonEnvBuilder:
        ), "Unexpectedly instantiated from outside a test function"
        self.test_name = test_name

-    def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv:
+    def init_configs(self) -> NeonEnv:
        # Cannot create more than one environment from one builder
        assert self.env is None, "environment already initialized"
-        if default_remote_storage_if_missing and self.pageserver_remote_storage is None:
-            self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
        self.env = NeonEnv(self)
        return self.env

@@ -472,19 +470,8 @@ class NeonEnvBuilder:
        assert self.env is not None, "environment is not already initialized, call init() first"
        self.env.start()

-    def init_start(
-        self,
-        initial_tenant_conf: Optional[Dict[str, str]] = None,
-        default_remote_storage_if_missing: bool = True,
-    ) -> NeonEnv:
-        """
-        Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
-
-        To avoid creating initial_tenant, call init_configs to setup the environment.
-
-        Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one.
-        """
-        env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing)
+    def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
+        env = self.init_configs()
        self.start()

        # Prepare the default branch to start the postgres on later.
@@ -559,7 +546,7 @@ class NeonEnvBuilder:
        user: RemoteStorageUser,
        bucket_name: Optional[str] = None,
        bucket_region: Optional[str] = None,
-    ) -> RemoteStorage:
+    ) -> Optional[RemoteStorage]:
        ret = kind.configure(
            self.repo_dir,
            self.mock_s3_server,
@@ -902,8 +889,6 @@ def _shared_simple_env(
    """
    # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
     is set, this is shared by all tests using `neon_simple_env`.
-
-    This fixture will use RemoteStorageKind.LOCAL_FS with pageserver.
    """

    if os.environ.get("TEST_SHARED_FIXTURES") is None:
@@ -1496,16 +1481,6 @@ class NeonAttachmentService:
            self.running = False
        return self

-    def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
-        response = requests.post(
-            f"{self.env.control_plane_api}/attach_hook",
-            json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
-        )
-        response.raise_for_status()
-        gen = response.json()["gen"]
-        assert isinstance(gen, int)
-        return gen
-
    def __enter__(self) -> "NeonAttachmentService":
        return self

@@ -1714,7 +1689,12 @@ class NeonPageserver(PgProtocol):
        to call into the pageserver HTTP client.
        """
        if self.env.attachment_service is not None:
-            generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
+            response = requests.post(
+                f"{self.env.control_plane_api}/attach_hook",
+                json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
+            )
+            response.raise_for_status()
+            generation = response.json()["gen"]
        else:
            generation = None

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -620,8 +620,3 @@ class PageserverHttpClient(requests.Session):
            },
        )
        self.verbose_error(res)
-
-    def deletion_queue_flush(self, execute: bool = False):
-        self.put(
-            f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
-        ).raise_for_status()
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional


 def list_prefix(
-    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
+    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
 ) -> ListObjectsV2OutputTypeDef:
    """
    Note that this function takes into account prefix_in_bucket.
@@ -287,7 +287,7 @@ def list_prefix(

    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
    response = remote.client.list_objects_v2(
-        Delimiter=delimiter,
+        Delimiter="/",
        Bucket=remote.bucket_name,
        Prefix=prefix,
    )
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -202,6 +202,9 @@ class RemoteStorageKind(str, enum.Enum):
    LOCAL_FS = "local_fs"
    MOCK_S3 = "mock_s3"
    REAL_S3 = "real_s3"
+    # Pass to tests that are generic to remote storage
+    # to ensure the tests pass with or without the remote storage
+    NOOP = "noop"

    def configure(
        self,
@@ -212,7 +215,10 @@ class RemoteStorageKind(str, enum.Enum):
        user: RemoteStorageUser,
        bucket_name: Optional[str] = None,
        bucket_region: Optional[str] = None,
-    ) -> RemoteStorage:
+    ) -> Optional[RemoteStorage]:
+        if self == RemoteStorageKind.NOOP:
+            return None
+
        if self == RemoteStorageKind.LOCAL_FS:
            return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user))

--- a/test_runner/performance/test_pageserver_startup_many_tenants.py
+++ b/test_runner/performance/test_pageserver_startup_many_tenants.py
@@ -1,52 +0,0 @@
-import queue
-import threading
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.types import TenantId
-
-"""
-553  sudo mkfs.ext4 /dev/nvme1n1
-555  mkdir test_output
-556  sudo mount /dev/nvme1n1 test_output
-557  htop
-559  ./scripts/pysync
-560  NEON_BIN=/home/admin/neon/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
-561  sudo chown -R admin:admin test_output
-
-cargo build_testing --release
-
-562  NEON_BIN=$PWD/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
-
-cd test_output/test_pageserver_startup_many_tenants/repo
-
-sudo env  NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000  ../../../target/release/neon_local start
-# watch initial load complete, then background jobs start. That's the interesting part.
-sudo env  NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000  ../../../target/release/neon_local stop
-# usually pageserver won't be responsive, kill with
-sudo pkill -9 pageserver
-"""
-def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
-
-    #  below doesn't work because summaries contain tenant and timeline ids and we check for them
-
-    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
-    pshttp = env.pageserver.http_client()
-    ep = env.endpoints.create_start("main")
-    ep.safe_psql("create table foo(b text)")
-    for i in range(0, 8):
-        ep.safe_psql("insert into foo(b) values ('some text')")
-        # pg_bin.run_capture(["pgbench", "-i", "-s1", ep.connstr()])
-        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
-        pshttp.timeline_checkpoint(tenant_id, timeline_id)
-    ep.stop_and_destroy()
-
-    env.pageserver.stop()
-    for sk in env.safekeepers:
-        sk.stop()
-
-    tenant_dir = env.repo_dir / "pageserver_1" / "tenants" / str(env.initial_tenant)
-
-    for i in range(0, 20_000):
-        import shutil
-
-        shutil.copytree(tenant_dir, tenant_dir.parent / str(TenantId.generate()))
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -4,12 +4,7 @@ from typing import List, Tuple

 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    Endpoint,
-    NeonEnv,
-    NeonEnvBuilder,
-    wait_for_last_flush_lsn,
-)
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder
 from fixtures.types import TenantId, TimelineId


@@ -31,18 +26,17 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):

    tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []

-    for _ in range(3):
+    for _ in range(4):
        tenant_id, timeline_id = env.neon_cli.create_tenant()

        endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
        with endpoint.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
        endpoint.stop()
        tenant_timelines.append((tenant_id, timeline_id, endpoint))

-    # Stop the pageserver -- this has to be not immediate or we need to wait for uploads
+    # Stop the pageserver
    env.pageserver.stop()

    # Leave the first timeline alone, but corrupt the others in different ways
@@ -51,21 +45,30 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):

    (tenant1, timeline1, pg1) = tenant_timelines[1]
    metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
-    with open(metadata_path, "w") as f:
-        f.write("overwritten with garbage!")
+    f = open(metadata_path, "w")
+    f.write("overwritten with garbage!")
+    f.close()
    log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")

    (tenant2, timeline2, pg2) = tenant_timelines[2]
    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
+    for filename in os.listdir(timeline_path):
+        if filename.startswith("00000"):
+            # Looks like a layer file. Remove it
+            os.remove(f"{timeline_path}/{filename}")
+    log.info(
+        f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
+    )
+
+    (tenant3, timeline3, pg3) = tenant_timelines[3]
+    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant3}/timelines/{timeline3}/"
    for filename in os.listdir(timeline_path):
        if filename.startswith("00000"):
            # Looks like a layer file. Corrupt it
-            p = f"{timeline_path}/{filename}"
-            size = os.path.getsize(p)
-            with open(p, "wb") as f:
-                f.truncate(0)
-                f.truncate(size)
-    log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")
+            f = open(f"{timeline_path}/{filename}", "w")
+            f.write("overwritten with garbage!")
+            f.close()
+    log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")

    env.pageserver.start()

@@ -84,13 +87,22 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
        f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
    )

-    # Second timeline will fail during basebackup, because the local layer file is corrupt.
+    # Second timeline has no ancestors, only the metadata file and no layer files locally,
+    # and we don't have the remote storage enabled. It is loaded into memory, but getting
+    # the basebackup from it will fail.
+    with pytest.raises(
+        Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
+    ) as err:
+        pg2.start()
+    log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
+
+    # Third timeline will also fail during basebackup, because the layer file is corrupt.
    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
    # (We don't check layer file contents on startup, when loading the timeline)
    with pytest.raises(Exception, match="Failed to load delta layer") as err:
-        pg2.start()
+        pg3.start()
    log.info(
-        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
+        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
    )


--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -211,12 +211,4 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
        ddl.wait()

    ddl.failures(False)
-    cur.execute("CREATE DATABASE failure WITH OWNER=cork")
-    ddl.wait()
-    with pytest.raises(psycopg2.InternalError):
-        ddl.failures(True)
-        cur.execute("DROP DATABASE failure")
-        ddl.wait()
-    ddl.pg.connect(dbname="failure")  # Ensure we can connect after a failed drop
-
    conn.close()
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -74,13 +74,11 @@ class EvictionEnv:
    pgbench_init_lsns: Dict[TenantId, Lsn]

    def timelines_du(self) -> Tuple[int, int, int]:
-        return poor_mans_du(
-            self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
-        )
+        return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])

    def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
        return {
-            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
+            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
            for tid, tlid in self.timelines
        }

@@ -91,21 +89,7 @@ class EvictionEnv:
        """
        lsn = self.pgbench_init_lsns[tenant_id]
        with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint:
-            # instead of using pgbench --select-only which does point selects,
-            # run full table scans for all tables
-            with endpoint.connect() as conn:
-                cur = conn.cursor()
-
-                tables_cols = {
-                    "pgbench_accounts": "abalance",
-                    "pgbench_tellers": "tbalance",
-                    "pgbench_branches": "bbalance",
-                    "pgbench_history": "delta",
-                }
-
-                for table, column in tables_cols.items():
-                    cur.execute(f"select avg({column}) from {table}")
-                    _avg = cur.fetchone()
+            self.pg_bin.run(["pgbench", "-S", endpoint.connstr()])

    def pageserver_start_with_disk_usage_eviction(
        self, period, max_usage_pct, min_avail_bytes, mock_behavior
@@ -143,19 +127,6 @@ class EvictionEnv:
        self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")


-def human_bytes(amt: float) -> str:
-    suffixes = ["", "Ki", "Mi", "Gi"]
-
-    last = suffixes[-1]
-
-    for name in suffixes:
-        if amt < 1024 or name == last:
-            return f"{int(round(amt))} {name}B"
-        amt = amt / 1024
-
-    raise RuntimeError("unreachable")
-
-
@pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
    """
@@ -244,12 +215,8 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):

    healthy_tenant_id, healthy_timeline_id = env.timelines[1]

-    broken_size_pre, _, _ = poor_mans_du(
-        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
-    )
-    healthy_size_pre, _, _ = poor_mans_du(
-        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
-    )
+    broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
+    healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])

    # try to evict everything, then validate that broken tenant wasn't touched
    target = broken_size_pre + healthy_size_pre
@@ -257,12 +224,8 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
    response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
    log.info(f"{response}")

-    broken_size_post, _, _ = poor_mans_du(
-        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
-    )
-    healthy_size_post, _, _ = poor_mans_du(
-        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
-    )
+    broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
+    healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])

    assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
    assert healthy_size_post < healthy_size_pre
@@ -403,16 +366,18 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
    du_by_timeline = env.du_by_timeline()

    # pick any tenant
-    [warm, cold] = list(du_by_timeline.keys())
-    (tenant_id, timeline_id) = warm
+    [our_tenant, other_tenant] = list(du_by_timeline.keys())
+    (tenant_id, timeline_id) = our_tenant

-    # make picked tenant more recently used than the other one
+    # make our tenant more recently used than the other one
    env.warm_up_tenant(tenant_id)

    # Build up enough pressure to require evictions from both tenants,
    # but not enough to fall into global LRU.
-    # So, set target to all occupied space, except 2*env.layer_size per tenant
-    target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
+    # So, set target to all occupied space, except 2*env.layer_size per tenant
+    target = (
+        du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
+    )
    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
    log.info(f"{response}")

@@ -427,33 +392,22 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
            later_tenant_usage < du_by_timeline[tenant]
        ), "all tenants should have lost some layers"

-    warm_size = later_du_by_timeline[warm]
-
-    # bounds for warmed_size
-    warm_lower = 0.5 * du_by_timeline[warm]
-
-    # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
-    # So, check for up to 3 here.
-    warm_upper = warm_lower + 3 * env.layer_size
-
-    cold_size = later_du_by_timeline[cold]
-    cold_upper = 2 * env.layer_size
-
-    log.info(
-        f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
-    )
-    log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
-
-    assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
-    assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
-
    assert (
-        cold_size < cold_upper
-    ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
+        later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
+    ), "our warmed up tenant should be at about half capacity, part 1"
+    assert (
+        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+        # So, check for up to 3 here.
+        later_du_by_timeline[our_tenant]
+        < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
+    ), "our warmed up tenant should be at about half capacity, part 2"
+    assert (
+        later_du_by_timeline[other_tenant] < 2 * env.layer_size
+    ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"


 def poor_mans_du(
-    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
+    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
 ) -> Tuple[int, int, int]:
    """
    Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -476,11 +430,9 @@ def poor_mans_du(
                smallest_layer = min(smallest_layer, size)
            else:
                smallest_layer = size
-            if verbose:
-                log.info(f"{tenant_id}/{timeline_id} => {file.name} {size} ({human_bytes(size)})")
+            log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")

-        if verbose:
-            log.info(f"{tenant_id}/{timeline_id}: sum {total} ({human_bytes(total)})")
+        log.info(f"{tenant_id}/{timeline_id}: sum {total}")
        total_on_disk += total

    assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -1,352 +0,0 @@
-"""
-
-Tests in this module exercise the pageserver's behavior around generation numbers,
-as defined in docs/rfcs/025-generation-numbers.md.  Briefly, the behaviors we require
-of the pageserver are:
- Do not start a tenant without a generation number if control_plane_api is set
- Remote objects must be suffixed with generation
- Deletions may only be executed after validating generation
- Updates to remote_consistent_lsn may only be made visible after validating generation
-"""
-
-
-import re
-import time
-from typing import Optional
-
-import pytest
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    NeonEnv,
-    NeonEnvBuilder,
-    PgBin,
-    last_flush_lsn_upload,
-    wait_for_last_flush_lsn,
-)
-from fixtures.pageserver.utils import list_prefix
-from fixtures.remote_storage import (
-    RemoteStorageKind,
-)
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import print_gc_result, wait_until
-
-# A tenant configuration that is convenient for generating uploads and deletions
-# without a large amount of postgres traffic.
-TENANT_CONF = {
-    # small checkpointing and compaction targets to ensure we generate many upload operations
-    "checkpoint_distance": f"{128 * 1024}",
-    "compaction_threshold": "1",
-    "compaction_target_size": f"{128 * 1024}",
-    # no PITR horizon, we specify the horizon when we request on-demand GC
-    "pitr_interval": "0s",
-    # disable background compaction and GC. We invoke it manually when we want it to happen.
-    "gc_period": "0s",
-    "compaction_period": "0s",
-    # create image layers eagerly, so that GC can remove some layers
-    "image_creation_threshold": "1",
-}
-
-
-def generate_uploads_and_deletions(
-    env: NeonEnv,
-    *,
-    init: bool = True,
-    tenant_id: Optional[TenantId] = None,
-    timeline_id: Optional[TimelineId] = None,
-    data: Optional[str] = None,
-):
-    """
-    Using the environment's default tenant + timeline, generate a load pattern
-    that results in some uploads and some deletions to remote storage.
-    """
-
-    if tenant_id is None:
-        tenant_id = env.initial_tenant
-    assert tenant_id is not None
-
-    if timeline_id is None:
-        timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
-    ps_http = env.pageserver.http_client()
-
-    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-        if init:
-            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
-
-        def churn(data):
-            endpoint.safe_psql_many(
-                [
-                    f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 20000) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                    # to ensure that GC can actually remove some layers
-                    "VACUUM foo",
-                ]
-            )
-            assert tenant_id is not None
-            assert timeline_id is not None
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-            ps_http.timeline_checkpoint(tenant_id, timeline_id)
-
-        # Compaction should generate some GC-elegible layers
-        for i in range(0, 2):
-            churn(f"{i if data is None else data}")
-
-        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
-        print_gc_result(gc_result)
-        assert gc_result["layers_removed"] > 0
-
-
-def get_metric_or_0(ps_http, metric: str) -> int:
-    v = ps_http.get_metric_value(metric)
-    return 0 if v is None else int(v)
-
-
-def get_deletion_queue_executed(ps_http) -> int:
-    return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total")
-
-
-def get_deletion_queue_submitted(ps_http) -> int:
-    return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
-
-
-def get_deletion_queue_dropped(ps_http) -> int:
-    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
-
-
-def get_deletion_queue_unexpected_errors(ps_http) -> int:
-    return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total")
-
-
-def get_deletion_queue_dropped_lsn_updates(ps_http) -> int:
-    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total")
-
-
-def get_deletion_queue_depth(ps_http) -> int:
-    """
-    Queue depth if at least one deletion has been submitted, else None
-    """
-    submitted = get_deletion_queue_submitted(ps_http)
-    executed = get_deletion_queue_executed(ps_http)
-    dropped = get_deletion_queue_dropped(ps_http)
-    depth = submitted - executed - dropped
-    log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})")
-
-    assert depth >= 0
-    return int(depth)
-
-
-def assert_deletion_queue(ps_http, size_fn) -> None:
-    v = get_deletion_queue_depth(ps_http)
-    assert v is not None
-    assert size_fn(v) is True
-
-
-def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
-    """
-    Validate behavior when a pageserver is run without generation support enabled,
-    then started again after activating it:
-    - Before upgrade, no objects should have generation suffixes
-    - After upgrade, the bucket should contain a mixture.
-    - In both cases, postgres I/O should work.
-    """
-    neon_env_builder.enable_generations = True
-    neon_env_builder.enable_pageserver_remote_storage(
-        RemoteStorageKind.MOCK_S3,
-    )
-
-    env = neon_env_builder.init_configs()
-    env.broker.try_start()
-    for sk in env.safekeepers:
-        sk.start()
-    assert env.attachment_service is not None
-    env.attachment_service.start()
-
-    env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
-
-    env.neon_cli.create_tenant(
-        tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
-    )
-    generate_uploads_and_deletions(env)
-
-    def parse_generation_suffix(key):
-        m = re.match(".+-([0-9a-zA-Z]{8})$", key)
-        if m is None:
-            return None
-        else:
-            log.info(f"match: {m}")
-            log.info(f"group: {m.group(1)}")
-            return int(m.group(1), 16)
-
-    pre_upgrade_keys = list(
-        [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
-    )
-    for key in pre_upgrade_keys:
-        assert parse_generation_suffix(key) is None
-
-    env.pageserver.stop()
-
-    # Starting without the override that disabled control_plane_api
-    env.pageserver.start()
-
-    generate_uploads_and_deletions(env, init=False)
-
-    legacy_objects: list[str] = []
-    suffixed_objects = []
-    post_upgrade_keys = list(
-        [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
-    )
-    for key in post_upgrade_keys:
-        log.info(f"post-upgrade key: {key}")
-        if parse_generation_suffix(key) is not None:
-            suffixed_objects.append(key)
-        else:
-            legacy_objects.append(key)
-
-    # Bucket now contains a mixture of suffixed and non-suffixed objects
-    assert len(suffixed_objects) > 0
-    assert len(legacy_objects) > 0
-
-    assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
-
-
-def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_generations = True
-    neon_env_builder.enable_pageserver_remote_storage(
-        RemoteStorageKind.MOCK_S3,
-    )
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    assert env.attachment_service is not None
-
-    some_other_pageserver = 1234
-    ps_http = env.pageserver.http_client()
-
-    generate_uploads_and_deletions(env)
-
-    # Flush: pending deletions should all complete
-    assert_deletion_queue(ps_http, lambda n: n > 0)
-    ps_http.deletion_queue_flush(execute=True)
-    assert_deletion_queue(ps_http, lambda n: n == 0)
-    assert get_deletion_queue_dropped(ps_http) == 0
-
-    # Our visible remote_consistent_lsn should match projected
-    timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
-    assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
-    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
-
-    env.pageserver.allowed_errors.extend(
-        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
-    )
-
-    # Now advance the generation in the control plane: subsequent validations
-    # from the running pageserver will fail.  No more deletions should happen.
-    env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
-    generate_uploads_and_deletions(env, init=False)
-
-    assert_deletion_queue(ps_http, lambda n: n > 0)
-    queue_depth_before = get_deletion_queue_depth(ps_http)
-    executed_before = get_deletion_queue_executed(ps_http)
-    ps_http.deletion_queue_flush(execute=True)
-
-    # Queue drains to zero because we dropped deletions
-    assert_deletion_queue(ps_http, lambda n: n == 0)
-    # The executed counter has not incremented
-    assert get_deletion_queue_executed(ps_http) == executed_before
-    # The dropped counter has incremented to consume all of the deletions that were previously enqueued
-    assert get_deletion_queue_dropped(ps_http) == queue_depth_before
-
-    # Flush to S3 and see that remote_consistent_lsn does not advance: it cannot
-    # because generation validation fails.
-    timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
-    assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"]
-    assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0
-
-    # TODO: list bucket and confirm all objects have a generation suffix.
-
-    assert get_deletion_queue_unexpected_errors(ps_http) == 0
-
-
-@pytest.mark.parametrize("keep_attachment", [True, False])
-def test_deletion_queue_recovery(
-    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
-):
-    """
-    :param keep_attachment: If true, we re-attach after restart.  Else, we act as if some other
-    node took the attachment while we were restarting.
-    """
-    neon_env_builder.enable_generations = True
-    neon_env_builder.enable_pageserver_remote_storage(
-        RemoteStorageKind.MOCK_S3,
-    )
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-
-    ps_http = env.pageserver.http_client()
-
-    # Prevent deletion lists from being executed, to build up some backlog of deletions
-    ps_http.configure_failpoints(
-        [
-            ("deletion-queue-before-execute", "return"),
-        ]
-    )
-
-    generate_uploads_and_deletions(env)
-
-    # There should be entries in the deletion queue
-    assert_deletion_queue(ps_http, lambda n: n > 0)
-    ps_http.deletion_queue_flush()
-    before_restart_depth = get_deletion_queue_depth(ps_http)
-
-    assert get_deletion_queue_unexpected_errors(ps_http) == 0
-    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
-
-    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
-    env.pageserver.stop(immediate=True)
-
-    if not keep_attachment:
-        some_other_pageserver = 101010
-        assert env.attachment_service is not None
-        env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
-
-    env.pageserver.start()
-
-    def assert_deletions_submitted(n: int):
-        assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
-
-    # After restart, issue a flush to kick the deletion frontend to do recovery.
-    # It should recover all the operations we submitted before the restart.
-    ps_http.deletion_queue_flush(execute=False)
-    wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
-
-    # The queue should drain through completely if we flush it
-    ps_http.deletion_queue_flush(execute=True)
-    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
-
-    if keep_attachment:
-        # If we kept the attachment, then our pre-restart deletions should have executed
-        # successfully
-        assert get_deletion_queue_executed(ps_http) == before_restart_depth
-    else:
-        # If we lost the attachment, we should have dropped our pre-restart deletions.
-        assert get_deletion_queue_dropped(ps_http) == before_restart_depth
-        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
-
-    assert get_deletion_queue_unexpected_errors(ps_http) == 0
-    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
-
-    # Restart again
-    env.pageserver.stop(immediate=True)
-    env.pageserver.start()
-
-    # No deletion lists should be recovered: this demonstrates that deletion lists
-    # were cleaned up after being executed or dropped in the previous process lifetime.
-    time.sleep(1)
-    assert_deletion_queue(ps_http, lambda n: n == 0)
-
-    assert get_deletion_queue_unexpected_errors(ps_http) == 0
-    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -5,6 +5,7 @@ from pathlib import Path
 from queue import SimpleQueue
 from typing import Any, Dict, Set

+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
@@ -16,13 +17,15 @@ from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response

-# TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
-

+@pytest.mark.parametrize(
+    "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
+)
 def test_metric_collection(
    httpserver: HTTPServer,
    neon_env_builder: NeonEnvBuilder,
    httpserver_listen_address,
+    remote_storage_kind: RemoteStorageKind,
 ):
    (host, port) = httpserver_listen_address
    metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -52,7 +55,7 @@ def test_metric_collection(
        synthetic_size_calculation_interval="3s"
        """

-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")

@@ -65,14 +68,6 @@ def test_metric_collection(
    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
    # httpserver is shut down before pageserver during passing run
    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
-    # we have a fast rate of calculation, these can happen at shutdown
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
-    )
-
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -103,14 +98,17 @@ def test_metric_collection(
            total += sample[2]
        return int(total)

-    # upload some data to remote storage
-    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-    pageserver_http = env.pageserver.http_client()
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+    remote_uploaded = 0

-    remote_uploaded = get_num_remote_ops("index", "upload")
-    assert remote_uploaded > 0
+    # upload some data to remote storage
+    if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        pageserver_http = env.pageserver.http_client()
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+
+        remote_uploaded = get_num_remote_ops("index", "upload")
+        assert remote_uploaded > 0

    # we expect uploads at 1Hz, on busy runners this could be too optimistic,
    # so give 5s we only want to get the following upload after "ready" value.
@@ -213,14 +211,6 @@ def test_metric_collection_cleans_up_tempfile(

    # httpserver is shut down before pageserver during passing run
    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
-    # we have a fast rate of calculation, these can happen at shutdown
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
-    )
-
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -30,7 +30,9 @@ from fixtures.types import TenantId
 from fixtures.utils import run_pg_bench_small


-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+@pytest.mark.parametrize(
+    "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
+)
 def test_tenant_delete_smoke(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
@@ -41,12 +43,6 @@ def test_tenant_delete_smoke(
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.extend(
-        [
-            # The deletion queue will complain when it encounters simulated S3 errors
-            ".*deletion executor: DeleteObjects request failed.*",
-        ]
-    )

    # lucky race with stopping from flushing a layer we fail to schedule any uploads
    env.pageserver.allowed_errors.append(
@@ -142,12 +138,18 @@ FAILPOINTS_BEFORE_BACKGROUND = [
 def combinations():
    result = []

-    remotes = [RemoteStorageKind.MOCK_S3]
+    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
        remotes.append(RemoteStorageKind.REAL_S3)

    for remote_storage_kind in remotes:
        for delete_failpoint in FAILPOINTS:
+            if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
+                "timeline-delete-before-index-delete",
+            ):
+                # the above failpoint is not relevant for a config without remote storage
+                continue
+
            # Simulate failures for only one type of remote storage
            # to avoid log pollution and make tests run faster
            if remote_storage_kind is RemoteStorageKind.MOCK_S3:
@@ -193,32 +195,27 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
        ]
    )

-    if simulate_failures:
-        env.pageserver.allowed_errors.extend(
-            [
-                # The deletion queue will complain when it encounters simulated S3 errors
-                ".*deletion executor: DeleteObjects request failed.*",
-            ]
-        )
-
    ps_http = env.pageserver.http_client()

    timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
    with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
        # generate enough layers
        run_pg_bench_small(pg_bin, endpoint.connstr())
-        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+        if remote_storage_kind is RemoteStorageKind.NOOP:
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        else:
+            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

-        if remote_storage_kind in available_s3_storages():
-            assert_prefix_not_empty(
-                neon_env_builder,
-                prefix="/".join(
-                    (
-                        "tenants",
-                        str(tenant_id),
-                    )
-                ),
-            )
+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(tenant_id),
+                        )
+                    ),
+                )

    ps_http.configure_failpoints((failpoint, "return"))

@@ -249,7 +246,12 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
        env.pageserver.stop()
        env.pageserver.start()

-        if failpoint in (
+        if (
+            remote_storage_kind is RemoteStorageKind.NOOP
+            and failpoint == "tenant-delete-before-create-local-mark"
+        ):
+            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+        elif failpoint in (
            "tenant-delete-before-shutdown",
            "tenant-delete-before-create-remote-mark",
        ):
@@ -381,7 +383,6 @@ def test_tenant_delete_is_resumed_on_attach(
    assert not tenant_path.exists()

    if remote_storage_kind in available_s3_storages():
-        ps_http.deletion_queue_flush(execute=True)
        assert_prefix_empty(
            neon_env_builder,
            prefix="/".join(
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -519,8 +519,11 @@ def test_detach_while_attaching(
 # * restart the pageserver and verify that ignored tenant is still not loaded
 # * `load` the same tenant
 # * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
-def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
+def test_ignored_tenant_reattach(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
    wait_until_tenant_active,
 )
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 from fixtures.types import Lsn, TenantId, TimelineId


@@ -532,24 +532,7 @@ def test_single_branch_get_tenant_size_grows(
    assert size_after == prev, "size after restarting pageserver should not have changed"


-def assert_size_approx_equal(size_a, size_b):
-    """
-    Tests that evaluate sizes are checking the pageserver space consumption
-    that sits many layers below the user input.  The exact space needed
-    varies slightly depending on postgres behavior.
-
-    Rather than expecting postgres to be determinstic and occasionally
-    failing the test, we permit sizes for the same data to vary by a few pages.
-    """
-
-    # Determined empirically from examples of equality failures: they differ
-    # by page multiples of 8272, and usually by 1-3 pages.  Tolerate 4 to avoid
-    # failing on outliers from that observed range.
-    threshold = 4 * 8272
-
-    assert size_a == pytest.approx(size_b, abs=threshold)
-
-
+@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
 def test_get_tenant_size_with_multiple_branches(
    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
@@ -590,7 +573,7 @@ def test_get_tenant_size_with_multiple_branches(
    )

    size_after_first_branch = http_client.tenant_size(tenant_id)
-    assert_size_approx_equal(size_after_first_branch, size_at_branch)
+    assert size_after_first_branch == size_at_branch

    first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)

@@ -616,7 +599,7 @@ def test_get_tenant_size_with_multiple_branches(
        "second-branch", main_branch_name, tenant_id
    )
    size_after_second_branch = http_client.tenant_size(tenant_id)
-    assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
+    assert size_after_second_branch == size_after_continuing_on_main

    second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)

@@ -652,7 +635,7 @@ def test_get_tenant_size_with_multiple_branches(
    # tenant_size but so far this has been reliable, even though at least gc
    # and tenant_size race for the same locks
    size_after = http_client.tenant_size(tenant_id)
-    assert_size_approx_equal(size_after, size_after_thinning_branch)
+    assert size_after == size_after_thinning_branch

    size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
    size_debug = http_client.tenant_size_debug(tenant_id)
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -12,6 +12,7 @@ from fixtures.log_helper import log
 from fixtures.metrics import (
    PAGESERVER_GLOBAL_METRICS,
    PAGESERVER_PER_TENANT_METRICS,
+    PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
    parse_metrics,
 )
 from fixtures.neon_fixtures import (
@@ -231,10 +232,17 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
        assert value


-def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "remote_storage_kind",
+    # exercise both the code paths where remote_storage=None and remote_storage=Some(...)
+    [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
+)
+def test_pageserver_metrics_removed_after_detach(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
    """Tests that when a tenant is detached, the tenant specific metrics are not left behind"""

-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    neon_env_builder.num_safekeepers = 3

@@ -274,6 +282,9 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
    for tenant in [tenant_1, tenant_2]:
        pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
        expected = set(PAGESERVER_PER_TENANT_METRICS)
+        if remote_storage_kind == RemoteStorageKind.NOOP:
+            # if there's no remote storage configured, we don't expose the remote timeline client metrics
+            expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
        assert pre_detach_samples == expected

        env.pageserver.http_client().tenant_detach(tenant)
@@ -283,7 +294,9 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde


 # Check that empty tenants work with or without the remote storage
-@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+@pytest.mark.parametrize(
+    "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP]
+)
 def test_pageserver_with_empty_tenants(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
 ):
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -12,6 +12,7 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
@@ -144,12 +145,19 @@ DELETE_FAILPOINTS = [
 def combinations():
    result = []

-    remotes = [RemoteStorageKind.MOCK_S3]
+    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
        remotes.append(RemoteStorageKind.REAL_S3)

    for remote_storage_kind in remotes:
        for delete_failpoint in DELETE_FAILPOINTS:
+            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
+                "timeline-delete-before-index-delete",
+                "timeline-delete-after-index-delete",
+            ):
+                # the above failpoints are not relevant for config without remote storage
+                continue
+
            result.append((remote_storage_kind, delete_failpoint))
    return result

@@ -197,21 +205,23 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
    with env.endpoints.create_start("delete") as endpoint:
        # generate enough layers
        run_pg_bench_small(pg_bin, endpoint.connstr())
+        if remote_storage_kind is RemoteStorageKind.NOOP:
+            wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
+        else:
+            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)

-        last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
-
-        if remote_storage_kind in available_s3_storages():
-            assert_prefix_not_empty(
-                neon_env_builder,
-                prefix="/".join(
-                    (
-                        "tenants",
-                        str(env.initial_tenant),
-                        "timelines",
-                        str(timeline_id),
-                    )
-                ),
-            )
+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(env.initial_tenant),
+                            "timelines",
+                            str(timeline_id),
+                        )
+                    ),
+                )

    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
    # It appears when we stopped flush loop during deletion and then pageserver is stopped
@@ -797,8 +807,6 @@ def test_delete_orphaned_objects(
    reason = timeline_info["state"]["Broken"]["reason"]
    assert reason.endswith(f"failpoint: {failpoint}"), reason

-    ps_http.deletion_queue_flush(execute=True)
-
    for orphan in orphans:
        assert not orphan.exists()
        assert env.pageserver.log_contains(
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -301,8 +301,12 @@ def test_timeline_initial_logical_size_calculation_cancellation(
    # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"


-def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_init(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start()

@@ -333,12 +337,17 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
    )

    assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+        remote_storage_kind,
    )


-def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_post_checkpoint(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start()

@@ -360,14 +369,19 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder

    def check():
        assert_physical_size_invariants(
-            get_physical_size_values(env, env.initial_tenant, new_timeline_id),
+            get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+            remote_storage_kind,
        )

    wait_until(10, 1, check)


-def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_post_compaction(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    # Disable background compaction as we don't want it to happen after `get_physical_size` request
    # and before checking the expected size on disk, which makes the assertion failed
@@ -406,15 +420,21 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
    pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
    pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)

-    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)

    assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+        remote_storage_kind,
    )


-def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_timeline_physical_size_post_gc(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
    # and before checking the expected size on disk, which makes the assertion failed
@@ -451,10 +471,12 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
    pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
    pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)

-    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)

    assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+        remote_storage_kind,
    )


@@ -538,10 +560,14 @@ def test_timeline_size_metrics(
    assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)


-def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_tenant_physical_size(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
+):
    random.seed(100)

-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start()

@@ -549,10 +575,12 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
    client = env.pageserver.http_client()

    tenant, timeline = env.neon_cli.create_tenant()
+    if remote_storage_kind is not None:
+        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)

    def get_timeline_resident_physical_size(timeline: TimelineId):
-        sizes = get_physical_size_values(env, tenant, timeline)
-        assert_physical_size_invariants(sizes)
+        sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
+        assert_physical_size_invariants(sizes, remote_storage_kind)
        return sizes.prometheus_resident_physical

    timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
@@ -572,7 +600,8 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
        wait_for_last_flush_lsn(env, endpoint, tenant, timeline)
        pageserver_http.timeline_checkpoint(tenant, timeline)

-        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
+        if remote_storage_kind is not None:
+            wait_for_upload_queue_empty(pageserver_http, tenant, timeline)

        timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)

@@ -601,6 +630,7 @@ def get_physical_size_values(
    env: NeonEnv,
    tenant_id: TenantId,
    timeline_id: TimelineId,
+    remote_storage_kind: Optional[RemoteStorageKind],
 ) -> TimelinePhysicalSizeValues:
    res = TimelinePhysicalSizeValues()

@@ -616,9 +646,12 @@ def get_physical_size_values(
    res.prometheus_resident_physical = metrics.query_one(
        "pageserver_resident_physical_size", metrics_filter
    ).value
-    res.prometheus_remote_physical = metrics.query_one(
-        "pageserver_remote_physical_size", metrics_filter
-    ).value
+    if remote_storage_kind is not None:
+        res.prometheus_remote_physical = metrics.query_one(
+            "pageserver_remote_physical_size", metrics_filter
+        ).value
+    else:
+        res.prometheus_remote_physical = None

    detail = client.timeline_detail(
        tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
@@ -631,15 +664,20 @@ def get_physical_size_values(
    return res


-def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
+def assert_physical_size_invariants(
+    sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
+):
    # resident phyiscal size is defined as
    assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
    assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum

    # we don't do layer eviction, so, all layers are resident
    assert sizes.api_current_physical == sizes.prometheus_resident_physical
-    assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
-    # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
+    if remote_storage_kind is not None:
+        assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
+        # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
+    else:
+        assert sizes.prometheus_remote_physical is None


 # Timeline logical size initialization is an asynchronous background task that runs once,
Author	SHA1	Message	Date
John Spray	2ce2574aa4	docs: pageserver controller rfc	2023-09-29 18:24:24 +01:00
John Spray	dc5f107170	docs: sharding phase 1 RFC	2023-09-29 18:20:13 +01:00
John Spray	1569446396	clarifications	2023-09-27 10:38:52 +01:00
John Spray	a8143a3bed	Align cutover downloads with heatmap	2023-09-27 10:38:52 +01:00
John Spray	689b6f14b7	Apply suggestions from code review Co-authored-by: Christian Schwarz <christian@neon.tech>	2023-09-27 10:38:41 +01:00
John Spray	9c1c06ad17	Bump number of migration RFC	2023-09-26 10:57:11 +01:00
John Spray	40d2a73a0c	Merge remote-tracking branch 'upstream/main' into jcsp/rfc-migration	2023-09-26 10:56:57 +01:00
John Spray	89ddefb428	Note safety requirement for AttachedMulti & out of scope item	2023-09-12 14:15:46 +01:00
John Spray	cad0799521	Mention disabling consumption metrics in AttachedStale	2023-09-12 14:04:01 +01:00
John Spray	1143e2e9ce	Clarifications	2023-09-12 12:13:29 +01:00
Christian Schwarz	ef3e75abc3	for #5029 (rfc tenant migrations): editorial fixes (#5185 )	2023-09-01 18:10:44 +01:00
John Spray	cfb285139c	docs/rfcs: add RFC for fast tenant migration/failover	2023-08-31 10:55:17 +01:00