From 16f06222228ffeaef9d3bbfe701035d2b78bf20d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 26 Sep 2023 17:59:25 +0300
Subject: [PATCH 01/24] fix: real_s3 flakyness with rust tests (#5386)

Fixes #5072. See proof from
https://github.com/neondatabase/neon/issues/5072#issuecomment-1735580798.
Turns out multiple threads can get the same nanoseconds since epoch, so
switch to using millis (for finding the prefix later on) and randomness
via `thread_rng` (to protect against adversarial CI runners).

Also changes the "per test looking alike" prefix to a more "general"
prefix.
---
 Cargo.lock                                |  1 +
 libs/remote_storage/Cargo.toml            |  1 +
 libs/remote_storage/tests/test_real_s3.rs | 15 ++++++++++++---
 3 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 2055f001af..55c80e30a7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3417,6 +3417,7 @@ dependencies = [
  "metrics",
  "once_cell",
  "pin-project-lite",
+ "rand",
  "scopeguard",
  "serde",
  "serde_json",
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index a4adae6146..2b808779f4 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -29,3 +29,4 @@ workspace_hack.workspace = true
 [dev-dependencies]
 tempfile.workspace = true
 test-context.workspace = true
+rand.workspace = true
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 982c01a9be..b220349829 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -378,21 +378,30 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
 fn create_s3_client(
     max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
     let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
         .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
     let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
         .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
-    let random_prefix_part = std::time::SystemTime::now()
+
+    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
+    // millis is just a debugging aid that makes the prefix easier to find later.
+    let millis = std::time::SystemTime::now()
         .duration_since(UNIX_EPOCH)
         .context("random s3 test prefix part calculation")?
-        .as_nanos();
+        .as_millis();
+
+    // millis can collide across threads just like nanos did, so add randomness
+    let random = rand::thread_rng().gen::<u32>();
+
     let remote_storage_config = RemoteStorageConfig {
         max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
         max_sync_errors: NonZeroU32::new(5).unwrap(),
         storage: RemoteStorageKind::AwsS3(S3Config {
             bucket_name: remote_storage_s3_bucket,
             bucket_region: remote_storage_s3_region,
-            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+            prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
             endpoint: None,
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
             max_keys_per_list_response,

From ba92668e3714b3fab25a7769cba1880dd888f3c1 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 26 Sep 2023 16:11:55 +0100
Subject: [PATCH 02/24] pageserver: deletion queue & generation validation for
 deletions (#5207)

## Problem

Pageservers must not delete objects or advertise updates to
remote_consistent_lsn without checking that they hold the latest
generation for the tenant in question (see [the RFC](
https://github.com/neondatabase/neon/blob/main/docs/rfcs/025-generation-numbers.md))

In this PR:
- A new "deletion queue" subsystem is introduced, through which
deletions flow
- `RemoteTimelineClient` is modified to send deletions through the
deletion queue:
  - For GC & compaction, deletions flow through the full generation
  verifying process
  - For timeline deletions, deletions take a fast path that bypasses
  generation verification
- The `last_uploaded_consistent_lsn` value in `UploadQueue` is replaced
with a mechanism that maintains a "projected" lsn (equivalent to the
previous property), and a "visible" LSN (which is the one that we may
share with safekeepers).
- Until `control_plane_api` is set, all deletions skip generation
validation
- Tests are introduced for the new functionality in
`test_pageserver_generations.py`

Once this lands, if a pageserver is configured with the
`control_plane_api` configuration added in
https://github.com/neondatabase/neon/pull/5163, it becomes safe to
attach a tenant to multiple pageservers concurrently.

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 control_plane/src/bin/attachment_service.rs   |    1 +
 libs/pageserver_api/src/models.rs             |    7 +
 libs/remote_storage/src/lib.rs                |   27 +
 libs/remote_storage/src/s3_bucket.rs          |    7 +-
 libs/utils/src/generation.rs                  |   16 +
 libs/utils/src/http/error.rs                  |    7 +
 pageserver/src/bin/pageserver.rs              |   29 +-
 pageserver/src/config.rs                      |   34 +-
 pageserver/src/control_plane_client.rs        |  170 ++-
 pageserver/src/deletion_queue.rs              | 1312 +++++++++++++++++
 pageserver/src/deletion_queue/deleter.rs      |  156 ++
 pageserver/src/deletion_queue/list_writer.rs  |  487 ++++++
 pageserver/src/deletion_queue/validator.rs    |  414 ++++++
 pageserver/src/http/openapi_spec.yml          |    3 +
 pageserver/src/http/routes.rs                 |   90 +-
 pageserver/src/lib.rs                         |   13 +-
 pageserver/src/metrics.rs                     |   51 +
 pageserver/src/repository.rs                  |    2 +-
 pageserver/src/task_mgr.rs                    |    2 +-
 pageserver/src/tenant.rs                      |   50 +-
 pageserver/src/tenant/mgr.rs                  |   68 +-
 .../src/tenant/remote_timeline_client.rs      |  232 +--
 .../tenant/remote_timeline_client/delete.rs   |   34 -
 .../tenant/remote_timeline_client/download.rs |    7 +-
 pageserver/src/tenant/timeline.rs             |   26 +-
 pageserver/src/tenant/timeline/delete.rs      |    7 +-
 .../walreceiver/walreceiver_connection.rs     |    5 +-
 pageserver/src/tenant/upload_queue.rs         |   47 +-
 test_runner/fixtures/neon_fixtures.py         |   17 +-
 test_runner/fixtures/pageserver/http.py       |    5 +
 test_runner/fixtures/pageserver/utils.py      |    4 +-
 .../regress/test_pageserver_generations.py    |  352 +++++
 test_runner/regress/test_tenant_delete.py     |   15 +
 test_runner/regress/test_timeline_delete.py   |    2 +
 34 files changed, 3388 insertions(+), 311 deletions(-)
 create mode 100644 pageserver/src/deletion_queue.rs
 create mode 100644 pageserver/src/deletion_queue/deleter.rs
 create mode 100644 pageserver/src/deletion_queue/list_writer.rs
 create mode 100644 pageserver/src/deletion_queue/validator.rs
 delete mode 100644 pageserver/src/tenant/remote_timeline_client/delete.rs
 create mode 100644 test_runner/regress/test_pageserver_generations.py

diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs
index e879646b63..d4bca59c7b 100644
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -223,6 +223,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
     if attach_req.pageserver_id.is_some() {
         tenant_state.generation += 1;
     }
+    tenant_state.pageserver = attach_req.pageserver_id;
     let generation = tenant_state.generation;
 
     locked.save().await.map_err(ApiError::InternalServerError)?;
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index f354296be2..68620787bb 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -363,8 +363,15 @@ pub struct TimelineInfo {
     pub latest_gc_cutoff_lsn: Lsn,
     #[serde_as(as = "DisplayFromStr")]
     pub disk_consistent_lsn: Lsn,
+
+    /// The LSN that we have successfully uploaded to remote storage
     #[serde_as(as = "DisplayFromStr")]
     pub remote_consistent_lsn: Lsn,
+
+    /// The LSN that we are advertising to safekeepers
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn_visible: Lsn,
+
     pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
     /// Sum of the size of all layer files.
     /// If a layer is present in both local FS and S3, it counts only once.
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 1ddd156a08..a92b87632b 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -20,6 +20,7 @@ use std::{
 
 use anyhow::{bail, Context};
 
+use serde::{Deserialize, Serialize};
 use tokio::io;
 use toml_edit::Item;
 use tracing::info;
@@ -42,6 +43,9 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
 
+/// As defined in S3 docs
+pub const MAX_KEYS_PER_DELETE: usize = 1000;
+
 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 
 /// Path on the remote storage, relative to some inner prefix.
@@ -50,6 +54,25 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);
 
+impl Serialize for RemotePath {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.collect_str(self)
+    }
+}
+
+impl<'de> Deserialize<'de> for RemotePath {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let str = String::deserialize(deserializer)?;
+        Ok(Self(PathBuf::from(&str)))
+    }
+}
+
 impl std::fmt::Display for RemotePath {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.0.display())
@@ -88,6 +111,10 @@ impl RemotePath {
     pub fn extension(&self) -> Option<&str> {
         self.0.extension()?.to_str()
     }
+
+    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
+        self.0.strip_prefix(&p.0)
+    }
 }
 
 /// Storage (potentially remote) API to manage its state.
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 9262f1e88f..acab953904 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -33,11 +33,10 @@ use tracing::debug;
 
 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
-const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
-
 pub(super) mod metrics;
 
 use self::metrics::{AttemptOutcome, RequestKind};
@@ -500,7 +499,7 @@ impl RemoteStorage for S3Bucket {
             delete_objects.push(obj_id);
         }
 
-        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
             let started_at = start_measuring_requests(kind);
 
             let resp = self
diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 163c8c0467..88d50905c6 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -89,6 +89,22 @@ impl Generation {
             Self::Broken => panic!("Attempted to use a broken generation"),
         }
     }
+
+    pub fn next(&self) -> Generation {
+        match self {
+            Self::Valid(n) => Self::Valid(*n + 1),
+            Self::None => Self::Valid(1),
+            Self::Broken => panic!("Attempted to use a broken generation"),
+        }
+    }
+
+    pub fn into(self) -> Option<u32> {
+        if let Self::Valid(v) = self {
+            Some(v)
+        } else {
+            None
+        }
+    }
 }
 
 impl Serialize for Generation {
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 527e486fd0..dd54cd6ecd 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -24,6 +24,9 @@ pub enum ApiError {
     #[error("Precondition failed: {0}")]
     PreconditionFailed(Box<str>),
 
+    #[error("Shutting down")]
+    ShuttingDown,
+
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -52,6 +55,10 @@ impl ApiError {
                 self.to_string(),
                 StatusCode::PRECONDITION_FAILED,
             ),
+            ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
+                "Shutting down".to_string(),
+                StatusCode::SERVICE_UNAVAILABLE,
+            ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                 err.to_string(),
                 StatusCode::INTERNAL_SERVER_ERROR,
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index b6a2117f9c..90c7c11194 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
@@ -20,6 +21,7 @@ use metrics::set_build_info_metric;
 use pageserver::{
     config::{defaults::*, PageServerConf},
     context::{DownloadBehavior, RequestContext},
+    deletion_queue::DeletionQueue,
     http, page_cache, page_service, task_mgr,
     task_mgr::TaskKind,
     task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
@@ -346,9 +348,22 @@ fn start_pageserver(
         }
     };
 
+    // Top-level cancellation token for the process
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
     // Set up remote storage client
     let remote_storage = create_remote_storage_client(conf)?;
 
+    // Set up deletion queue
+    let (deletion_queue, deletion_workers) = DeletionQueue::new(
+        remote_storage.clone(),
+        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        conf,
+    );
+    if let Some(deletion_workers) = deletion_workers {
+        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
+    }
+
     // Up to this point no significant I/O has been done: this should have been fast.  Record
     // duration prior to starting I/O intensive phase of startup.
     startup_checkpoint("initial", "Starting loading tenants");
@@ -379,13 +394,13 @@ fn start_pageserver(
     };
 
     // Scan the local 'tenants/' directory and start loading the tenants
-    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
-
+    let deletion_queue_client = deletion_queue.new_client();
     BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
         conf,
         TenantSharedResources {
             broker_client: broker_client.clone(),
             remote_storage: remote_storage.clone(),
+            deletion_queue_client,
         },
         order,
         shutdown_pageserver.clone(),
@@ -481,9 +496,10 @@ fn start_pageserver(
             http::routes::State::new(
                 conf,
                 http_auth.clone(),
-                remote_storage,
+                remote_storage.clone(),
                 broker_client.clone(),
                 disk_usage_eviction_state,
+                deletion_queue.new_client(),
             )
             .context("Failed to initialize router state")?,
         );
@@ -611,7 +627,12 @@ fn start_pageserver(
             // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
             // The plan is to change that over time.
             shutdown_pageserver.take();
-            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            ));
             unreachable!()
         }
     })
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 8ee7f28c11..ed767b764e 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -475,8 +475,8 @@ impl PageServerConfigBuilder {
         self.background_task_maximum_delay = BuilderValue::Set(delay);
     }
 
-    pub fn control_plane_api(&mut self, api: Url) {
-        self.control_plane_api = BuilderValue::Set(Some(api))
+    pub fn control_plane_api(&mut self, api: Option<Url>) {
+        self.control_plane_api = BuilderValue::Set(api)
     }
 
     pub fn build(self) -> anyhow::Result<PageServerConf> {
@@ -580,6 +580,27 @@ impl PageServerConf {
         self.workdir.join(TENANTS_SEGMENT_NAME)
     }
 
+    pub fn deletion_prefix(&self) -> PathBuf {
+        self.workdir.join("deletion")
+    }
+
+    pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix()
+            .join(format!("{sequence:016x}-{VERSION:02x}.list"))
+    }
+
+    pub fn deletion_header_path(&self) -> PathBuf {
+        // Encode a version in the filename, so that if we ever switch away from JSON we can
+        // increment this.
+        const VERSION: u8 = 1;
+
+        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
+    }
+
     pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
         self.tenants_path().join(tenant_id.to_string())
     }
@@ -747,7 +768,14 @@ impl PageServerConf {
                 },
                 "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                 "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
-                "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?),
+                "control_plane_api" => {
+                    let parsed = parse_toml_string(key, item)?;
+                    if parsed.is_empty() {
+                        builder.control_plane_api(None)
+                    } else {
+                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
+                    }
+                },
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 192eb16789..555f76e523 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,7 +1,9 @@
 use std::collections::HashMap;
 
-use hyper::StatusCode;
-use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
+use pageserver_api::control_api::{
+    ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+};
+use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{
@@ -12,25 +14,34 @@ use utils::{
 
 use crate::config::PageServerConf;
 
-// Backoffs when control plane requests do not succeed: compromise between reducing load
-// on control plane, and retrying frequently when we are blocked on a control plane
-// response to make progress.
-const BACKOFF_INCREMENT: f64 = 0.1;
-const BACKOFF_MAX: f64 = 10.0;
-
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
-pub(crate) struct ControlPlaneClient {
+pub struct ControlPlaneClient {
     http_client: reqwest::Client,
     base_url: Url,
     node_id: NodeId,
     cancel: CancellationToken,
 }
 
+/// Represent operations which internally retry on all errors other than
+/// cancellation token firing: the only way they can fail is ShuttingDown.
+pub enum RetryForeverError {
+    ShuttingDown,
+}
+
+#[async_trait::async_trait]
+pub trait ControlPlaneGenerationsApi {
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+}
+
 impl ControlPlaneClient {
     /// A None return value indicates that the input `conf` object does not have control
     /// plane API enabled.
-    pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
+    pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
         let mut url = match conf.control_plane_api.as_ref() {
             Some(u) => u.clone(),
             None => return None,
@@ -54,27 +65,62 @@ impl ControlPlaneClient {
         })
     }
 
-    async fn try_re_attach(
+    async fn retry_http_forever<R, T>(
         &self,
-        url: Url,
-        request: &ReAttachRequest,
-    ) -> anyhow::Result<ReAttachResponse> {
-        match self.http_client.post(url).json(request).send().await {
-            Err(e) => Err(anyhow::Error::from(e)),
-            Ok(r) => {
-                if r.status() == StatusCode::OK {
-                    r.json::<ReAttachResponse>()
-                        .await
-                        .map_err(anyhow::Error::from)
-                } else {
-                    Err(anyhow::anyhow!("Unexpected status {}", r.status()))
-                }
+        url: &url::Url,
+        request: R,
+    ) -> Result<T, RetryForeverError>
+    where
+        R: Serialize,
+        T: DeserializeOwned,
+    {
+        #[derive(thiserror::Error, Debug)]
+        enum RemoteAttemptError {
+            #[error("shutdown")]
+            Shutdown,
+            #[error("remote: {0}")]
+            Remote(reqwest::Error),
+        }
+
+        match backoff::retry(
+            || async {
+                let response = self
+                    .http_client
+                    .post(url.clone())
+                    .json(&request)
+                    .send()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)?;
+
+                response
+                    .error_for_status_ref()
+                    .map_err(RemoteAttemptError::Remote)?;
+                response
+                    .json::<T>()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)
+            },
+            |_| false,
+            3,
+            u32::MAX,
+            "calling control plane generation validation API",
+            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
+        )
+        .await
+        {
+            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
+            Err(RemoteAttemptError::Remote(_)) => {
+                panic!("We retry forever, this should never be reached");
             }
+            Ok(r) => Ok(r),
         }
     }
+}
 
-    /// Block until we get a successful response
-    pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
+#[async_trait::async_trait]
+impl ControlPlaneGenerationsApi for ControlPlaneClient {
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
         let re_attach_path = self
             .base_url
             .join("re-attach")
@@ -83,37 +129,47 @@ impl ControlPlaneClient {
             node_id: self.node_id,
         };
 
-        let mut attempt = 0;
-        loop {
-            let result = self.try_re_attach(re_attach_path.clone(), &request).await;
-            match result {
-                Ok(res) => {
-                    tracing::info!(
-                        "Received re-attach response with {} tenants",
-                        res.tenants.len()
-                    );
+        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
+        tracing::info!(
+            "Received re-attach response with {} tenants",
+            response.tenants.len()
+        );
 
-                    return Ok(res
-                        .tenants
-                        .into_iter()
-                        .map(|t| (t.id, Generation::new(t.generation)))
-                        .collect::<HashMap<_, _>>());
-                }
-                Err(e) => {
-                    tracing::error!("Error re-attaching tenants, retrying: {e:#}");
-                    backoff::exponential_backoff(
-                        attempt,
-                        BACKOFF_INCREMENT,
-                        BACKOFF_MAX,
-                        &self.cancel,
-                    )
-                    .await;
-                    if self.cancel.is_cancelled() {
-                        return Err(anyhow::anyhow!("Shutting down"));
-                    }
-                    attempt += 1;
-                }
-            }
-        }
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|t| (t.id, Generation::new(t.generation)))
+            .collect::<HashMap<_, _>>())
+    }
+
+    /// Block until we get a successful response, or error out if we are shut down
+    async fn validate(
+        &self,
+        tenants: Vec<(TenantId, Generation)>,
+    ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+        let re_attach_path = self
+            .base_url
+            .join("validate")
+            .expect("Failed to build validate path");
+
+        let request = ValidateRequest {
+            tenants: tenants
+                .into_iter()
+                .map(|(id, gen)| ValidateRequestTenant {
+                    id,
+                    gen: gen
+                        .into()
+                        .expect("Generation should always be valid for a Tenant doing deletions"),
+                })
+                .collect(),
+        };
+
+        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
+
+        Ok(response
+            .tenants
+            .into_iter()
+            .map(|rt| (rt.id, rt.valid))
+            .collect())
     }
 }
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
new file mode 100644
index 0000000000..4c0d399789
--- /dev/null
+++ b/pageserver/src/deletion_queue.rs
@@ -0,0 +1,1312 @@
+mod deleter;
+mod list_writer;
+mod validator;
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::virtual_file::VirtualFile;
+use anyhow::Context;
+use hex::FromHex;
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use serde::Deserialize;
+use serde::Serialize;
+use serde_with::serde_as;
+use thiserror::Error;
+use tokio;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+use tracing::{self, debug, error};
+use utils::crashsafe::path_with_suffix_extension;
+use utils::generation::Generation;
+use utils::id::{TenantId, TimelineId};
+use utils::lsn::AtomicLsn;
+use utils::lsn::Lsn;
+
+use self::deleter::Deleter;
+use self::list_writer::DeletionOp;
+use self::list_writer::ListWriter;
+use self::list_writer::RecoverOp;
+use self::validator::Validator;
+use deleter::DeleterMessage;
+use list_writer::ListWriterQueueMessage;
+use validator::ValidatorQueueMessage;
+
+use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
+
+// TODO: administrative "panic button" config property to disable all deletions
+// TODO: configurable for how long to wait before executing deletions
+
+/// We aggregate object deletions from many tenants in one place, for several reasons:
+/// - Coalesce deletions into fewer DeleteObjects calls
+/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
+///   to flush any outstanding deletions.
+/// - Globally control throughput of deletions, as these are a low priority task: do
+///   not compete with the same S3 clients/connections used for higher priority uploads.
+/// - Enable gating deletions on validation of a tenant's generation number, to make
+///   it safe to multi-attach tenants (see docs/rfcs/025-generation-numbers.md)
+///
+/// There are two kinds of deletion: deferred and immediate.  A deferred deletion
+/// may be intentionally delayed to protect passive readers of S3 data, and is
+/// subject to a generation number validation step.  An immediate deletion is
+/// ready to execute immediately, and is only queued up so that it can be coalesced
+/// with other deletions in flight.
+///
+/// Deferred deletions pass through three steps:
+/// - ListWriter: accumulate deletion requests from Timelines, and batch them up into
+///   DeletionLists, which are persisted to disk.
+/// - Validator: accumulate deletion lists, and validate them en-masse prior to passing
+///   the keys in the list onward for actual deletion.  Also validate remote_consistent_lsn
+///   updates for running timelines.
+/// - Deleter: accumulate object keys that the validator has validated, and execute them in
+///   batches of 1000 keys via DeleteObjects.
+///
+/// Non-deferred deletions, such as during timeline deletion, bypass the first
+/// two stages and are passed straight into the Deleter.
+///
+/// Internally, each stage is joined by a channel to the next.  On disk, there is only
+/// one queue (of DeletionLists), which is written by the frontend and consumed
+/// by the backend.
+#[derive(Clone)]
+pub struct DeletionQueue {
+    client: DeletionQueueClient,
+
+    // Parent cancellation token for the tokens passed into background workers
+    cancel: CancellationToken,
+}
+
+/// Opaque wrapper around individual worker tasks, to avoid making the
+/// worker objects themselves public
+pub struct DeletionQueueWorkers<C>
+where
+    C: ControlPlaneGenerationsApi + Send + Sync,
+{
+    frontend: ListWriter,
+    backend: Validator<C>,
+    executor: Deleter,
+}
+
+impl<C> DeletionQueueWorkers<C>
+where
+    C: ControlPlaneGenerationsApi + Send + Sync + 'static,
+{
+    pub fn spawn_with(mut self, runtime: &tokio::runtime::Handle) -> tokio::task::JoinHandle<()> {
+        let jh_frontend = runtime.spawn(async move {
+            self.frontend
+                .background()
+                .instrument(tracing::info_span!(parent:None, "deletion frontend"))
+                .await
+        });
+        let jh_backend = runtime.spawn(async move {
+            self.backend
+                .background()
+                .instrument(tracing::info_span!(parent:None, "deletion backend"))
+                .await
+        });
+        let jh_executor = runtime.spawn(async move {
+            self.executor
+                .background()
+                .instrument(tracing::info_span!(parent:None, "deletion executor"))
+                .await
+        });
+
+        runtime.spawn({
+            async move {
+                jh_frontend.await.expect("error joining frontend worker");
+                jh_backend.await.expect("error joining backend worker");
+                drop(jh_executor.await.expect("error joining executor worker"));
+            }
+        })
+    }
+}
+
+/// A FlushOp is just a oneshot channel, where we send the transmit side down
+/// another channel, and the receive side will receive a message when the channel
+/// we're flushing has reached the FlushOp we sent into it.
+///
+/// The only extra behavior beyond the channel is that the notify() method does not
+/// return an error when the receive side has been dropped, because in this use case
+/// it is harmless (the code that initiated the flush no longer cares about the result).
+#[derive(Debug)]
+struct FlushOp {
+    tx: tokio::sync::oneshot::Sender<()>,
+}
+
+impl FlushOp {
+    fn new() -> (Self, tokio::sync::oneshot::Receiver<()>) {
+        let (tx, rx) = tokio::sync::oneshot::channel::<()>();
+        (Self { tx }, rx)
+    }
+
+    fn notify(self) {
+        if self.tx.send(()).is_err() {
+            // oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
+            debug!("deletion queue flush from dropped client");
+        };
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct DeletionQueueClient {
+    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+
+    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct TenantDeletionList {
+    /// For each Timeline, a list of key fragments to append to the timeline remote path
+    /// when reconstructing a full key
+    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+    timelines: HashMap<TimelineId, Vec<String>>,
+
+    /// The generation in which this deletion was emitted: note that this may not be the
+    /// same as the generation of any layers being deleted.  The generation of the layer
+    /// has already been absorbed into the keys in `timelines`
+    generation: Generation,
+}
+
+impl TenantDeletionList {
+    pub(crate) fn len(&self) -> usize {
+        self.timelines.values().map(|v| v.len()).sum()
+    }
+}
+
+/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
+fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+    V: Serialize,
+    I: AsRef<[u8]>,
+{
+    input
+        .iter()
+        .map(|(k, v)| (hex::encode(k), v))
+        .collect::<HashMap<String, &V>>()
+        .serialize(serializer)
+}
+
+/// For HashMaps using a FromHex key, where we would like to decode the key
+fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+    V: Deserialize<'de>,
+    I: FromHex + std::hash::Hash + Eq,
+{
+    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
+    hex_map
+        .into_iter()
+        .map(|(k, v)| {
+            I::from_hex(k)
+                .map(|k| (k, v))
+                .map_err(|_| serde::de::Error::custom("Invalid hex ID"))
+        })
+        .collect()
+}
+
+/// Files ending with this suffix will be ignored and erased
+/// during recovery at startup.
+const TEMP_SUFFIX: &str = ".tmp";
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+struct DeletionList {
+    /// Serialization version, for future use
+    version: u8,
+
+    /// Used for constructing a unique key for each deletion list we write out.
+    sequence: u64,
+
+    /// To avoid repeating tenant/timeline IDs in every key, we store keys in
+    /// nested HashMaps by TenantTimelineID.  Each Tenant only appears once
+    /// with one unique generation ID: if someone tries to push a second generation
+    /// ID for the same tenant, we will start a new DeletionList.
+    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+    tenants: HashMap<TenantId, TenantDeletionList>,
+
+    /// Avoid having to walk `tenants` to calculate the number of keys in
+    /// the nested deletion lists
+    size: usize,
+
+    /// Set to true when the list has undergone validation with the control
+    /// plane and the remaining contents of `tenants` are valid.  A list may
+    /// also be implicitly marked valid by DeletionHeader.validated_sequence
+    /// advancing to >= DeletionList.sequence
+    #[serde(default)]
+    #[serde(skip_serializing_if = "std::ops::Not::not")]
+    validated: bool,
+}
+
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize)]
+struct DeletionHeader {
+    /// Serialization version, for future use
+    version: u8,
+
+    /// The highest sequence number (inclusive) that has been validated.  All deletion
+    /// lists on disk with a sequence <= this value are safe to execute.
+    validated_sequence: u64,
+}
+
+impl DeletionHeader {
+    const VERSION_LATEST: u8 = 1;
+
+    fn new(validated_sequence: u64) -> Self {
+        Self {
+            version: Self::VERSION_LATEST,
+            validated_sequence,
+        }
+    }
+
+    async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
+        debug!("Saving deletion list header {:?}", self);
+        let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
+        let header_path = conf.deletion_header_path();
+        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
+        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
+            .await
+            .map_err(Into::into)
+    }
+}
+
+impl DeletionList {
+    const VERSION_LATEST: u8 = 1;
+    fn new(sequence: u64) -> Self {
+        Self {
+            version: Self::VERSION_LATEST,
+            sequence,
+            tenants: HashMap::new(),
+            size: 0,
+            validated: false,
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.tenants.is_empty()
+    }
+
+    fn len(&self) -> usize {
+        self.size
+    }
+
+    /// Drains `objects` into this list and returns true, or returns false (leaving `objects`
+    /// untouched) if the tenant is already present with a different generation: start a new list.
+    fn push(
+        &mut self,
+        tenant: &TenantId,
+        timeline: &TimelineId,
+        generation: Generation,
+        objects: &mut Vec<RemotePath>,
+    ) -> bool {
+        if objects.is_empty() {
+            // Avoid inserting an empty TenantDeletionList: this preserves the property
+            // that if we have no keys, then self.tenants is empty (used in Self::is_empty)
+            return true;
+        }
+
+        let tenant_entry = self
+            .tenants
+            .entry(*tenant)
+            .or_insert_with(|| TenantDeletionList {
+                timelines: HashMap::new(),
+                generation,
+            });
+
+        if tenant_entry.generation != generation {
+            // Only one generation per tenant per list: signal to
+            // caller to start a new list.
+            return false;
+        }
+
+        let timeline_entry = tenant_entry
+            .timelines
+            .entry(*timeline)
+            .or_insert_with(Vec::new);
+
+        let timeline_remote_path = remote_timeline_path(tenant, timeline);
+
+        self.size += objects.len();
+        timeline_entry.extend(objects.drain(..).map(|p| {
+            p.strip_prefix(&timeline_remote_path)
+                .expect("Timeline paths always start with the timeline prefix")
+                .to_string_lossy()
+                .to_string()
+        }));
+        true
+    }
+
+    fn into_remote_paths(self) -> Vec<RemotePath> {
+        let mut result = Vec::new();
+        for (tenant, tenant_deletions) in self.tenants.into_iter() {
+            for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
+                let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
+                result.extend(
+                    timeline_layers
+                        .into_iter()
+                        .map(|l| timeline_remote_path.join(&PathBuf::from(l))),
+                );
+            }
+        }
+
+        result
+    }
+
+    async fn save(&self, conf: &'static PageServerConf) -> anyhow::Result<()> {
+        let path = conf.deletion_list_path(self.sequence);
+        let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
+
+        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
+        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
+            .await
+            .map_err(Into::into)
+    }
+}
+
+impl std::fmt::Display for DeletionList {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "DeletionList<seq={}, tenants={}, keys={}>",
+            self.sequence,
+            self.tenants.len(),
+            self.size
+        )
+    }
+}
+
+struct PendingLsn {
+    projected: Lsn,
+    result_slot: Arc<AtomicLsn>,
+}
+
+struct TenantLsnState {
+    timelines: HashMap<TimelineId, PendingLsn>,
+
+    // In what generation was the most recent update proposed?
+    generation: Generation,
+}
+
+#[derive(Default)]
+struct VisibleLsnUpdates {
+    tenants: HashMap<TenantId, TenantLsnState>,
+}
+
+impl VisibleLsnUpdates {
+    fn new() -> Self {
+        Self {
+            tenants: HashMap::new(),
+        }
+    }
+}
+
+impl std::fmt::Debug for VisibleLsnUpdates {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "VisibleLsnUpdates({} tenants)", self.tenants.len())
+    }
+}
+
+#[derive(Error, Debug)]
+pub enum DeletionQueueError {
+    #[error("Deletion queue unavailable during shutdown")]
+    ShuttingDown,
+}
+
+impl DeletionQueueClient {
+    pub(crate) fn broken() -> Self {
+        // Channels whose receivers are immediately dropped.
+        let (tx, _rx) = tokio::sync::mpsc::channel(1);
+        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
+        Self {
+            tx,
+            executor_tx,
+            lsn_table: Arc::default(),
+        }
+    }
+
+    /// This is cancel-safe.  If you drop the future before it completes, the message
+    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
+    /// we decide to do a deletion the decision is always final.
+    async fn do_push<T>(
+        &self,
+        queue: &tokio::sync::mpsc::Sender<T>,
+        msg: T,
+    ) -> Result<(), DeletionQueueError> {
+        match queue.send(msg).await {
+            Ok(_) => Ok(()),
+            Err(e) => {
+                // This shouldn't happen, we should shut down all tenants before
+                // we shut down the global delete queue.  If we encounter a bug like this,
+                // we may leak objects as deletions won't be processed.
+                error!("Deletion queue closed while pushing, shutting down? ({e})");
+                Err(DeletionQueueError::ShuttingDown)
+            }
+        }
+    }
+
+    pub(crate) async fn recover(
+        &self,
+        attached_tenants: HashMap<TenantId, Generation>,
+    ) -> Result<(), DeletionQueueError> {
+        self.do_push(
+            &self.tx,
+            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
+        )
+        .await
+    }
+
+    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
+    /// world, it must validate its generation number before doing so.  Rather than do this synchronously,
+    /// we allow the timeline to publish updates at will via this API, and then read back what LSN was most
+    /// recently validated separately.
+    ///
+    /// In this function we publish the LSN to the `projected` field of the timeline's entry in the VisibleLsnUpdates.  The
+    /// backend will later wake up and notice that the tenant's generation requires validation.
+    pub(crate) async fn update_remote_consistent_lsn(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        lsn: Lsn,
+        result_slot: Arc<AtomicLsn>,
+    ) {
+        let mut locked = self
+            .lsn_table
+            .write()
+            .expect("Lock should never be poisoned");
+
+        let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
+            timelines: HashMap::new(),
+            generation: current_generation,
+        });
+
+        if tenant_entry.generation != current_generation {
+            // Generation might have changed if we were detached and then re-attached: in this case,
+            // state from the previous generation cannot be trusted.
+            tenant_entry.timelines.clear();
+            tenant_entry.generation = current_generation;
+        }
+
+        tenant_entry.timelines.insert(
+            timeline_id,
+            PendingLsn {
+                projected: lsn,
+                result_slot,
+            },
+        );
+    }
+
+    /// Submit a list of layers for deletion: this function will return before the deletion is
+    /// persistent, but it may be executed at any time after this function enters: do not push
+    /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
+    /// references them).
+    ///
+    /// The `current_generation` is the generation of this pageserver's current attachment.  The
+    /// generations in `layers` are the generations in which those layers were written.
+    pub(crate) async fn push_layers(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> Result<(), DeletionQueueError> {
+        if current_generation.is_none() {
+            debug!("Enqueuing deletions in legacy mode, skipping queue");
+            let mut layer_paths = Vec::new();
+            for (layer, generation) in layers {
+                layer_paths.push(remote_layer_path(
+                    &tenant_id,
+                    &timeline_id,
+                    &layer,
+                    generation,
+                ));
+            }
+            self.push_immediate(layer_paths).await?;
+            return self.flush_immediate().await;
+        }
+
+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(layers.len() as u64);
+        self.do_push(
+            &self.tx,
+            ListWriterQueueMessage::Delete(DeletionOp {
+                tenant_id,
+                timeline_id,
+                layers,
+                generation: current_generation,
+                objects: Vec::new(),
+            }),
+        )
+        .await
+    }
+
+    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
+    async fn do_flush<T>(
+        &self,
+        queue: &tokio::sync::mpsc::Sender<T>,
+        msg: T,
+        rx: tokio::sync::oneshot::Receiver<()>,
+    ) -> Result<(), DeletionQueueError> {
+        self.do_push(queue, msg).await?;
+        if rx.await.is_err() {
+            // This shouldn't happen if tenants are shut down before deletion queue.  If we
+            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
+            // when it hasn't, possibly leading to leaking objects.
+            error!("Deletion queue dropped flush op while client was still waiting");
+            Err(DeletionQueueError::ShuttingDown)
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
+    ///
+    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
+    pub async fn flush(&self) -> Result<(), DeletionQueueError> {
+        let (flush_op, rx) = FlushOp::new();
+        self.do_flush(&self.tx, ListWriterQueueMessage::Flush(flush_op), rx)
+            .await
+    }
+
+    // Wait until all previous deletions are executed
+    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
+        debug!("flush_execute: flushing to deletion lists...");
+        // Flush any buffered work to deletion lists
+        self.flush().await?;
+
+        // Flush the backend into the executor of deletion lists
+        let (flush_op, rx) = FlushOp::new();
+        debug!("flush_execute: flushing backend...");
+        self.do_flush(&self.tx, ListWriterQueueMessage::FlushExecute(flush_op), rx)
+            .await?;
+        debug!("flush_execute: finished flushing backend...");
+
+        // Flush any immediate-mode deletions (the above backend flush will only flush
+        // the executor if deletions had flowed through the backend)
+        debug!("flush_execute: flushing execution...");
+        let (flush_op, rx) = FlushOp::new();
+        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
+            .await?;
+        debug!("flush_execute: finished flushing execution...");
+        Ok(())
+    }
+
+    /// This interface bypasses the persistent deletion queue, and any validation
+    /// that this pageserver is still eligible to execute the deletions.  It is for
+    /// use in timeline deletions, where the control plane is telling us we may
+    /// delete everything in the timeline.
+    ///
+    /// DO NOT USE THIS FROM GC OR COMPACTION CODE.  Use the regular `push_layers`.
+    pub(crate) async fn push_immediate(
+        &self,
+        objects: Vec<RemotePath>,
+    ) -> Result<(), DeletionQueueError> {
+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(objects.len() as u64);
+        self.executor_tx
+            .send(DeleterMessage::Delete(objects))
+            .await
+            .map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+
+    /// Companion to push_immediate.  When this returns Ok, all prior objects sent
+    /// into push_immediate have been deleted from remote storage.
+    pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
+        let (flush_op, rx) = FlushOp::new();
+        self.executor_tx
+            .send(DeleterMessage::Flush(flush_op))
+            .await
+            .map_err(|_| DeletionQueueError::ShuttingDown)?;
+
+        rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+}
+
+impl DeletionQueue {
+    pub fn new_client(&self) -> DeletionQueueClient {
+        self.client.clone()
+    }
+
+    /// Caller may use the returned object to construct clients with new_client.
+    /// Caller should spawn the returned workers (see `DeletionQueueWorkers::spawn_with`):
+    /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
+    ///
+    /// If remote_storage is None, then the returned workers will also be None.
+    pub fn new<C>(
+        remote_storage: Option<GenericRemoteStorage>,
+        control_plane_client: Option<C>,
+        conf: &'static PageServerConf,
+    ) -> (Self, Option<DeletionQueueWorkers<C>>)
+    where
+        C: ControlPlaneGenerationsApi + Send + Sync,
+    {
+        // Deep channel: it consumes deletions from all timelines and we do not want to block them
+        let (tx, rx) = tokio::sync::mpsc::channel(16384);
+
+        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
+        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
+
+        // Shallow channel: it carries lists of paths, and we expect the main queueing to
+        // happen in the backend (persistent), not in this queue.
+        let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
+
+        let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new()));
+
+        // The deletion queue has an independent cancellation token to
+        // the general pageserver shutdown token, because it stays alive a bit
+        // longer to flush after Tenants have all been torn down.
+        let cancel = CancellationToken::new();
+
+        let remote_storage = match remote_storage {
+            None => {
+                return (
+                    Self {
+                        client: DeletionQueueClient {
+                            tx,
+                            executor_tx,
+                            lsn_table: lsn_table.clone(),
+                        },
+                        cancel,
+                    },
+                    None,
+                )
+            }
+            Some(r) => r,
+        };
+
+        (
+            Self {
+                client: DeletionQueueClient {
+                    tx,
+                    executor_tx: executor_tx.clone(),
+                    lsn_table: lsn_table.clone(),
+                },
+                cancel: cancel.clone(),
+            },
+            Some(DeletionQueueWorkers {
+                frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()),
+                backend: Validator::new(
+                    conf,
+                    backend_rx,
+                    executor_tx,
+                    control_plane_client,
+                    lsn_table.clone(),
+                    cancel.clone(),
+                ),
+                executor: Deleter::new(remote_storage, executor_rx, cancel.clone()),
+            }),
+        )
+    }
+
+    /// Flush outstanding deletions, waiting at most `timeout`, then cancel the background
+    /// workers.  A failed or timed-out flush is logged rather than returned, and the workers
+    /// are cancelled regardless.
+    pub async fn shutdown(&mut self, timeout: Duration) {
+        match tokio::time::timeout(timeout, self.client.flush()).await {
+            Ok(Ok(())) => tracing::info!("Deletion queue flushed successfully on shutdown"),
+            Ok(Err(DeletionQueueError::ShuttingDown)) => {
+                // This is not harmful for correctness, but is unexpected: the deletion
+                // queue's workers should stay alive as long as there are any client handles instantiated.
+                tracing::warn!("Deletion queue stopped prematurely");
+            }
+            Err(_timeout) => tracing::warn!("Timed out flushing deletion queue on shutdown"),
+        }
+
+        // Cancel only after flushing: cancelling first would stop the workers that do the flush.
+        self.cancel.cancel();
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use hex_literal::hex;
+    use std::{
+        io::ErrorKind,
+        path::{Path, PathBuf},
+        time::Duration,
+    };
+    use tracing::info;
+
+    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
+    use tokio::task::JoinHandle;
+
+    use crate::{
+        control_plane_client::RetryForeverError,
+        repository::Key,
+        tenant::{
+            harness::TenantHarness, remote_timeline_client::remote_timeline_path,
+            storage_layer::DeltaFileName,
+        },
+    };
+
+    use super::*;
+    pub const TIMELINE_ID: TimelineId =
+        TimelineId::from_array(hex!("11223344556677881122334455667788"));
+
+    pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
+        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
+        lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
+    });
+
+    // When you need a second layer in a test.
+    pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
+        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
+        lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
+    });
+
+    struct TestSetup {
+        harness: TenantHarness,
+        remote_fs_dir: PathBuf,
+        storage: GenericRemoteStorage,
+        mock_control_plane: MockControlPlane,
+        deletion_queue: DeletionQueue,
+        worker_join: JoinHandle<()>,
+    }
+
+    impl TestSetup {
+        /// Simulate a pageserver restart by destroying and recreating the deletion queue
+        async fn restart(&mut self) {
+            let (deletion_queue, workers) = DeletionQueue::new(
+                Some(self.storage.clone()),
+                Some(self.mock_control_plane.clone()),
+                self.harness.conf,
+            );
+
+            tracing::debug!("Spawning worker for new queue queue");
+            let worker_join = workers
+                .unwrap()
+                .spawn_with(&tokio::runtime::Handle::current());
+
+            let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join);
+            let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue);
+
+            tracing::debug!("Joining worker from previous queue");
+            old_deletion_queue.cancel.cancel();
+            old_worker_join
+                .await
+                .expect("Failed to join workers for previous deletion queue");
+        }
+
+        fn set_latest_generation(&self, gen: Generation) {
+            let tenant_id = self.harness.tenant_id;
+            self.mock_control_plane
+                .latest_generation
+                .lock()
+                .unwrap()
+                .insert(tenant_id, gen);
+        }
+
+        /// Returns remote layer file name, suitable for use in assert_remote_files
+        fn write_remote_layer(
+            &self,
+            file_name: LayerFileName,
+            gen: Generation,
+        ) -> anyhow::Result<String> {
+            let tenant_id = self.harness.tenant_id;
+            let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+            let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
+            std::fs::create_dir_all(&remote_timeline_path)?;
+            let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
+
+            let content: Vec<u8> = format!("placeholder contents of {file_name}").into();
+
+            std::fs::write(
+                remote_timeline_path.join(remote_layer_file_name.clone()),
+                content,
+            )?;
+
+            Ok(remote_layer_file_name)
+        }
+    }
+
+    #[derive(Debug, Clone)]
+    struct MockControlPlane {
+        pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
+    }
+
+    impl MockControlPlane {
+        fn new() -> Self {
+            Self {
+                latest_generation: Arc::default(),
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl ControlPlaneGenerationsApi for MockControlPlane {
+        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
+        async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+            unimplemented!()
+        }
+        async fn validate(
+            &self,
+            tenants: Vec<(TenantId, Generation)>,
+        ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+            // A tenant validates only if the offered generation equals the recorded latest
+            // one; tenants with no recorded generation are left out of the reply.
+            let known = self.latest_generation.lock().unwrap();
+            let verdicts = tenants
+                .into_iter()
+                .filter_map(|(tenant_id, generation)| {
+                    let latest = known.get(&tenant_id)?;
+                    Some((tenant_id, *latest == generation))
+                })
+                .collect();
+            Ok(verdicts)
+        }
+    }
+
+    fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
+        let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
+        // We do not load() the harness: we only need its config and remote_storage
+        let harness = TenantHarness::create(test_name)?;
+
+        // Back the GenericRemoteStorage with a directory inside the harness's workdir
+        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+        std::fs::create_dir_all(&remote_fs_dir)?;
+        let remote_fs_dir = std::fs::canonicalize(remote_fs_dir)?;
+
+        let max_concurrent_syncs = std::num::NonZeroUsize::new(
+            remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+        )
+        .unwrap();
+        let max_sync_errors =
+            std::num::NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
+                .unwrap();
+        let storage_config = RemoteStorageConfig {
+            max_concurrent_syncs,
+            max_sync_errors,
+            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+        };
+        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+
+        let mock_control_plane = MockControlPlane::new();
+
+        // The queue's worker is handed the current tokio runtime handle; the value that
+        // spawn_with() returns is kept in the TestSetup as `worker_join`.
+        let (deletion_queue, worker) = DeletionQueue::new(
+            Some(storage.clone()),
+            Some(mock_control_plane.clone()),
+            harness.conf,
+        );
+        let worker_join = worker.unwrap().spawn_with(&tokio::runtime::Handle::current());
+
+        Ok(TestSetup {
+            harness,
+            remote_fs_dir,
+            storage,
+            mock_control_plane,
+            deletion_queue,
+            worker_join,
+        })
+    }
+
+    // TODO: put this in a common location so that we can share with remote_timeline_client's tests
+    /// Assert that the directory `remote_path` contains exactly the file names listed in
+    /// `expected`; both sides are sorted before comparing, so order does not matter.
+    ///
+    /// A missing directory is treated like an empty one.  Any other error opening the
+    /// directory panics, while individual entries that fail to read are skipped.
+    fn assert_remote_files(expected: &[&str], remote_path: &Path) {
+        let mut expected: Vec<String> = expected.iter().map(|name| String::from(*name)).collect();
+        expected.sort();
+
+        let dir = match std::fs::read_dir(remote_path) {
+            Ok(dir) => dir,
+            Err(e) if e.kind() == ErrorKind::NotFound => {
+                // Nothing to list: this only passes if no files were expected
+                assert_eq!(expected, Vec::<String>::new());
+                return;
+            }
+            Err(e) => panic!(
+                "Unexpected error listing {}: {e}",
+                remote_path.to_string_lossy()
+            ),
+        };
+
+        // Gather the names of all readable entries
+        let mut found: Vec<String> = dir
+            .flatten()
+            .map(|entry| {
+                let entry_name = entry.file_name();
+                String::from(entry_name.to_str().unwrap())
+            })
+            .collect();
+        found.sort();
+
+        assert_eq!(expected, found);
+    }
+
+    /// Assert that `directory` holds exactly the file names in `expected`, in any order.
+    /// A directory that does not exist counts as empty; other I/O errors panic.
+    fn assert_local_files(expected: &[&str], directory: &Path) {
+        let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
+        expected.sort();
+
+        let mut found: Vec<String> = match std::fs::read_dir(directory) {
+            Ok(dir) => dir
+                .map(|dentry| dentry.unwrap().file_name().to_string_lossy().to_string())
+                .collect(),
+            // Only a missing directory means "empty": other errors would hide a real problem
+            Err(e) if e.kind() == ErrorKind::NotFound => Vec::new(),
+            Err(e) => panic!("Unexpected error listing {}: {e}", directory.display()),
+        };
+        found.sort();
+
+        assert_eq!(expected, found);
+    }
+
+    #[tokio::test]
+    async fn deletion_queue_smoke() -> anyhow::Result<()> {
+        // Basic test that the deletion queue processes the deletions we pass into it
+        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
+        let client = ctx.deletion_queue.new_client();
+        client.recover(HashMap::new()).await?;
+
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let tenant_id = ctx.harness.tenant_id;
+
+        let content: Vec<u8> = "victim1 contents".into();
+        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+        let deletion_prefix = ctx.harness.conf.deletion_prefix();
+
+        // Exercise the distinction between the generation of the layers
+        // we delete, and the generation of the running Tenant.
+        let layer_generation = Generation::new(0xdeadbeef);
+        let now_generation = Generation::new(0xfeedbeef);
+
+        let remote_layer_file_name_1 =
+            format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
+
+        // Tell the mock control plane that now_generation is the latest generation
+        ctx.set_latest_generation(now_generation);
+
+        // Inject a victim file to remote storage
+        info!("Writing");
+        std::fs::create_dir_all(&remote_timeline_path)?;
+        std::fs::write(
+            remote_timeline_path.join(remote_layer_file_name_1.clone()),
+            content,
+        )?;
+        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+
+        // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
+        info!("Pushing");
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                now_generation,
+                [(layer_file_name_1.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+
+        assert_local_files(&[], &deletion_prefix);
+
+        // File should still be there after flush(): that writes a local deletion list but executes nothing
+        info!("Flushing");
+        client.flush().await?;
+        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
+        assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
+
+        // Remote file should go away when we execute; locally only the header file is left behind
+        info!("Flush-executing");
+        client.flush_execute().await?;
+        assert_remote_files(&[], &remote_timeline_path);
+        assert_local_files(&["header-01"], &deletion_prefix);
+
+        // Flushing on an empty queue should succeed immediately, and not write any lists
+        info!("Flush-executing on empty");
+        client.flush_execute().await?;
+        assert_local_files(&["header-01"], &deletion_prefix);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn deletion_queue_validation() -> anyhow::Result<()> {
+        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
+        let client = ctx.deletion_queue.new_client();
+        client.recover(HashMap::new()).await?;
+
+        // Generation that the control plane thinks is current
+        let latest_generation = Generation::new(0xdeadbeef);
+        // Generation that our DeletionQueue thinks the tenant is running with
+        let stale_generation = latest_generation.previous();
+        // Generation that our example layer file was written with
+        let layer_generation = stale_generation.previous();
+
+        ctx.set_latest_generation(latest_generation);
+
+        let tenant_id = ctx.harness.tenant_id;
+        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+
+        // Initial state: a remote layer exists
+        let remote_layer_name = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
+        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
+
+        tracing::debug!("Pushing...");
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                stale_generation,
+                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        // We enqueued the operation in a stale generation: it should have failed validation,
+        // so the remote layer must still be there (the timeout guards against a hung flush)
+        tracing::debug!("Flushing...");
+        tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
+        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);
+
+        tracing::debug!("Pushing...");
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                latest_generation,
+                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        // We enqueued the operation in the latest generation: it should have passed validation,
+        // so this time the remote layer must be gone
+        tracing::debug!("Flushing...");
+        tokio::time::timeout(Duration::from_secs(5), client.flush_execute()).await??;
+        assert_remote_files(&[], &remote_timeline_path);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn deletion_queue_recovery() -> anyhow::Result<()> {
+        // Test which persisted deletion lists get executed after the queue restarts in a newer generation
+        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
+        let client = ctx.deletion_queue.new_client();
+        client.recover(HashMap::new()).await?;
+
+        let tenant_id = ctx.harness.tenant_id;
+
+        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
+        let deletion_prefix = ctx.harness.conf.deletion_prefix();
+
+        let layer_generation = Generation::new(0xdeadbeef);
+        let now_generation = Generation::new(0xfeedbeef);
+
+        // Inject a deletion in the generation before now_generation: after the restart below
+        // that is two generations behind, so this deletion should _not_ get executed (only
+        // the immediately previous generation gets that treatment)
+        let remote_layer_file_name_historical =
+            ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                now_generation.previous(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        // Inject a deletion in now_generation itself: after the restart below that is the
+        // immediately previous generation, so this deletion should get executed, because
+        // we execute deletions in the immediately previous generation on the same node.
+        let remote_layer_file_name_previous =
+            ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
+        client
+            .push_layers(
+                tenant_id,
+                TIMELINE_ID,
+                now_generation,
+                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
+            )
+            .await?;
+
+        client.flush().await?;
+        assert_remote_files(
+            &[
+                &remote_layer_file_name_historical,
+                &remote_layer_file_name_previous,
+            ],
+            &remote_timeline_path,
+        );
+
+        // Different generations for the same tenant will cause two separate
+        // deletion lists to be emitted.
+        assert_local_files(
+            &["0000000000000001-01.list", "0000000000000002-01.list"],
+            &deletion_prefix,
+        );
+
+        // Simulate a node restart: the latest generation advances
+        let now_generation = now_generation.next();
+        ctx.set_latest_generation(now_generation);
+
+        // Restart the deletion queue
+        drop(client);
+        ctx.restart().await;
+        let client = ctx.deletion_queue.new_client();
+        client
+            .recover(HashMap::from([(tenant_id, now_generation)]))
+            .await?;
+
+        info!("Flush-executing");
+        client.flush_execute().await?;
+        // The deletion from the immediately prior generation was executed, the one from
+        // an older generation was not: only the "historical" layer is still present.
+        assert_remote_files(&[&remote_layer_file_name_historical], &remote_timeline_path);
+        Ok(())
+    }
+}
+
+/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
+/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
+#[cfg(test)]
+pub(crate) mod mock {
+    use tracing::info;
+
+    use crate::tenant::remote_timeline_client::remote_layer_path;
+
+    use super::*;
+    use std::sync::{
+        atomic::{AtomicUsize, Ordering},
+        Arc,
+    };
+
+    /// Receiving ends of the mock queue's two channels, drained by `consume`.
+    pub struct ConsumerState {
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+    }
+
+    impl ConsumerState {
+        /// Drain every message currently queued on both channels, executing deletions
+        /// inline against `remote_storage` and notifying any flush waiters.
+        ///
+        /// Returns how many object deletions were attempted: failures are logged and
+        /// still counted.
+        async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
+            let mut executed = 0;
+
+            info!("Executing all pending deletions");
+
+            // Messages sent straight to the executor carry ready-made remote paths
+            while let Ok(msg) = self.executor_rx.try_recv() {
+                match msg {
+                    DeleterMessage::Delete(objects) => {
+                        for path in objects {
+                            match remote_storage.delete(&path).await {
+                                Ok(_) => debug!("Deleted {path}"),
+                                Err(e) => error!("Failed to delete {path}, leaking object! ({e})"),
+                            }
+                            executed += 1;
+                        }
+                    }
+                    DeleterMessage::Flush(flush_op) => {
+                        flush_op.notify();
+                    }
+                }
+            }
+
+            // Frontend messages may name layers, which are projected to remote paths here
+            while let Ok(msg) = self.rx.try_recv() {
+                match msg {
+                    ListWriterQueueMessage::Delete(op) => {
+                        let mut objects = op.objects;
+                        for (layer, generation) in op.layers {
+                            objects.push(remote_layer_path(
+                                &op.tenant_id,
+                                &op.timeline_id,
+                                &layer,
+                                generation,
+                            ));
+                        }
+
+                        for path in objects {
+                            info!("Executing deletion {path}");
+                            match remote_storage.delete(&path).await {
+                                Ok(_) => debug!("Deleted {path}"),
+                                Err(e) => error!("Failed to delete {path}, leaking object! ({e})"),
+                            }
+                            executed += 1;
+                        }
+                    }
+                    ListWriterQueueMessage::Flush(op) => {
+                        op.notify();
+                    }
+                    ListWriterQueueMessage::FlushExecute(op) => {
+                        // We have already executed all prior deletions because mock does them inline
+                        op.notify();
+                    }
+                    ListWriterQueueMessage::Recover(_) => {
+                        // no-op in mock
+                    }
+                }
+            }
+            // Log completion once, after the queue is drained, rather than once per message
+            info!("All pending deletions have been executed");
+
+            executed
+        }
+    }
+
+    /// Test double for the deletion queue: queued work is only executed inside `pump`.
+    pub struct MockDeletionQueue {
+        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+        executed: Arc<AtomicUsize>,
+        remote_storage: Option<GenericRemoteStorage>,
+        consumer: std::sync::Mutex<ConsumerState>,
+        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+    }
+
+    impl MockDeletionQueue {
+        pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
+            let (tx, rx) = tokio::sync::mpsc::channel(16384);
+            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);
+            let consumer = std::sync::Mutex::new(ConsumerState { rx, executor_rx });
+            Self {
+                tx,
+                executor_tx,
+                executed: Arc::new(AtomicUsize::new(0)),
+                remote_storage,
+                consumer,
+                lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
+            }
+        }
+
+        pub fn get_executed(&self) -> usize {
+            self.executed.load(Ordering::Relaxed)
+        }
+
+        /// Execute everything queued so far; a no-op when no remote storage was configured.
+        #[allow(clippy::await_holding_lock)]
+        pub async fn pump(&self) {
+            if let Some(remote_storage) = &self.remote_storage {
+                // Permit holding mutex across await, because this is only ever
+                // called once at a time in tests.
+                let mut consumer = self.consumer.lock().unwrap();
+                let count = consumer.consume(remote_storage).await;
+                self.executed.fetch_add(count, Ordering::Relaxed);
+            }
+        }
+
+        pub(crate) fn new_client(&self) -> DeletionQueueClient {
+            DeletionQueueClient {
+                tx: self.tx.clone(),
+                executor_tx: self.executor_tx.clone(),
+                lsn_table: Arc::clone(&self.lsn_table),
+            }
+        }
+    }
+}
diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs
new file mode 100644
index 0000000000..5c6e7dc9d7
--- /dev/null
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -0,0 +1,156 @@
+//! The deleter is the final stage in the deletion queue.  It accumulates remote
+//! paths to delete, and periodically executes them in batches of up to 1000
+//! using the DeleteObjects request.
+//!
+//! Its purpose is to increase efficiency of remote storage I/O by issuing a smaller
+//! number of full-sized DeleteObjects requests, rather than a larger number of
+//! smaller requests.
+
+use remote_storage::GenericRemoteStorage;
+use remote_storage::RemotePath;
+use remote_storage::MAX_KEYS_PER_DELETE;
+use std::time::Duration;
+use tokio_util::sync::CancellationToken;
+use tracing::info;
+use tracing::warn;
+
+use crate::metrics;
+
+use super::DeletionQueueError;
+use super::FlushOp;
+
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+pub(super) enum DeleterMessage {
+    Delete(Vec<RemotePath>), // remote paths to add to the pending batch
+    Flush(FlushOp), // execute the pending batch, then notify the requester
+}
+
+/// Non-persistent deletion queue, for coalescing multiple object deletes into
+/// larger DeleteObjects requests.
+pub(super) struct Deleter {
+    // Paths waiting for the next deletion request: never more than MAX_KEYS_PER_DELETE
+    accumulator: Vec<RemotePath>,
+
+    rx: tokio::sync::mpsc::Receiver<DeleterMessage>, // incoming work for the worker loop
+
+    cancel: CancellationToken, // once cancelled, the worker loop and flush() give up
+    remote_storage: GenericRemoteStorage, // where delete_objects() requests are sent
+}
+
+impl Deleter {
+    /// Pause between attempts when a DeleteObjects request fails.
+    const RETRY_PAUSE: Duration = Duration::from_secs(1);
+
+    pub(super) fn new(
+        remote_storage: GenericRemoteStorage,
+        rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            remote_storage,
+            rx,
+            cancel,
+            accumulator: Vec::new(),
+        }
+    }
+
+    /// Wrap the remote `delete_objects` with a failpoint
+    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+        fail::fail_point!("deletion-queue-before-execute", |_| {
+            info!("Skipping execution, failpoint set");
+            metrics::DELETION_QUEUE
+                .remote_errors
+                .with_label_values(&["failpoint"])
+                .inc();
+            Err(anyhow::anyhow!("failpoint hit"))
+        });
+
+        self.remote_storage.delete_objects(&self.accumulator).await
+    }
+
+    /// Block until everything in accumulator has been executed, or until we are cancelled
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+            match self.remote_delete().await {
+                Ok(()) => {
+                    // Note: we assume that the remote storage layer returns Ok(()) if some
+                    // or all of the deleted objects were already gone.
+                    metrics::DELETION_QUEUE
+                        .keys_executed
+                        .inc_by(self.accumulator.len() as u64);
+                    if let (Some(first), Some(last)) =
+                        (self.accumulator.first(), self.accumulator.last())
+                    {
+                        info!("Executed deletion batch {first}..{last}");
+                    }
+                    self.accumulator.clear();
+                }
+                Err(e) => {
+                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["execute"])
+                        .inc();
+                    // Pause before retrying so that a persistent failure does not turn
+                    // into a hot loop; the pause ends early if we are cancelled.
+                    let _ = tokio::time::timeout(Self::RETRY_PAUSE, self.cancel.cancelled()).await;
+                }
+            };
+        }
+        if self.cancel.is_cancelled() {
+            // Expose an error because we may not have actually flushed everything
+            Err(DeletionQueueError::ShuttingDown)
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Worker loop: accumulate and execute deletions until cancelled or all senders are gone
+    pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
+        self.accumulator.reserve(MAX_KEYS_PER_DELETE);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                return Err(DeletionQueueError::ShuttingDown);
+            }
+
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  This will
+                    // return immediately if no work is pending
+                    self.flush().await?;
+                    continue;
+                }
+            };
+
+            match msg {
+                DeleterMessage::Delete(mut list) => {
+                    while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                        if self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                            // The batch is full: execute it before accepting more keys
+                            self.flush().await?;
+                            assert_eq!(self.accumulator.len(), 0);
+                        }
+
+                        let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
+                        let take_count = std::cmp::min(available_slots, list.len());
+                        self.accumulator.extend(list.drain(list.len() - take_count..));
+                    }
+                }
+                DeleterMessage::Flush(flush_op) => {
+                    // If flush() errors, we drop the flush_op and the caller will get
+                    // an error recv()'ing their oneshot channel.
+                    self.flush().await?;
+                    flush_op.notify();
+                }
+            }
+        }
+    }
+}
diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs
new file mode 100644
index 0000000000..618a59f8fe
--- /dev/null
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -0,0 +1,487 @@
+//! The list writer is the first stage in the deletion queue.  It accumulates
+//! layers to delete, and periodically writes out these layers into a persistent
+//! DeletionList.
+//!
+//! The purpose of writing DeletionLists is to decouple the decision to
+//! delete an object from the validation required to execute it: even if
+//! validation is not possible, e.g. due to a control plane outage, we can
+//! still persist our intent to delete an object, in a way that would
+//! survive a restart.
+//!
+//! DeletionLists are passed onwards to the Validator.
+
+use super::DeletionHeader;
+use super::DeletionList;
+use super::FlushOp;
+use super::ValidatorQueueMessage;
+
+use std::collections::HashMap;
+use std::fs::create_dir_all;
+use std::time::Duration;
+
+use regex::Regex;
+use remote_storage::RemotePath;
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+use utils::generation::Generation;
+use utils::id::TenantId;
+use utils::id::TimelineId;
+
+use crate::config::PageServerConf;
+use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
+use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::storage_layer::LayerFileName;
+
+/// The number of keys in a DeletionList before we will proactively persist it
+/// (without reaching a flush deadline).  This aims to deliver objects of the order
+/// of magnitude 1MB when we are under heavy delete load.
+const DELETION_LIST_TARGET_SIZE: usize = 16384;
+
+/// Ordinarily, we only flush to DeletionList periodically (every 10 seconds), to bound
+/// the window during which we might leak objects from not flushing a DeletionList after
+/// the objects are already unlinked from timeline metadata.
+const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
+
+/// If someone is waiting for a flush to DeletionList, only delay a little (100ms) to
+/// accumulate more objects before doing the flush.
+const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
+
+#[derive(Debug)]
+pub(super) struct DeletionOp {
+    pub(super) tenant_id: TenantId,
+    pub(super) timeline_id: TimelineId,
+    // `layers` and `objects` both name remote objects to delete.  `layers` is for callers that
+    // do not have a config object handy to project a layer to a remote key: the consuming
+    // worker does that projection for them.
+    pub(super) layers: Vec<(LayerFileName, Generation)>,
+    pub(super) objects: Vec<RemotePath>,
+
+    /// The _current_ generation of the Tenant attachment in which we are enqueuing
+    /// this deletion.
+    pub(super) generation: Generation,
+}
+
+#[derive(Debug)]
+pub(super) struct RecoverOp {
+    pub(super) attached_tenants: HashMap<TenantId, Generation>, // latest attached generations
+}
+
+#[derive(Debug)]
+pub(super) enum ListWriterQueueMessage {
+    Delete(DeletionOp),
+    /// Wait until all prior deletions make it into a persistent DeletionList
+    Flush(FlushOp),
+    /// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
+    FlushExecute(FlushOp),
+    /// Call once after re-attaching to control plane, to notify the deletion queue about
+    /// latest attached generations & load any saved deletion lists from disk.
+    Recover(RecoverOp),
+}
+
+pub(super) struct ListWriter {
+    conf: &'static PageServerConf,
+
+    /// Incoming frontend requests to delete some keys
+    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+
+    /// Outbound channel to the validator, carrying the deletion lists we have composed.
+    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+
+    /// The list we are currently building, contains a buffer of keys to delete
+    /// and our next sequence number
+    pending: DeletionList,
+
+    /// These FlushOps should notify the next time we flush
+    pending_flushes: Vec<FlushOp>,
+
+    /// Worker loop is torn down when this fires.
+    cancel: CancellationToken,
+
+    /// Safety guard to do recovery exactly once
+    recovered: bool,
+}
+
+impl ListWriter {
+    /// Initially DeletionHeader.validated_sequence is zero.  The place we start our
+    /// sequence numbers must be higher than that.
+    const BASE_SEQUENCE: u64 = 1;
+
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            conf,
+            rx,
+            tx,
+            pending: DeletionList::new(Self::BASE_SEQUENCE), // first list to be written
+            pending_flushes: Vec::new(),
+            cancel,
+            recovered: false, // recovery has not run yet
+        }
+    }
+
+    /// Persist the pending deletion list, if it has any content, and hand it on to
+    /// the validator.
+    ///
+    /// With nothing pending, the waiting flushes are simply notified.
+    ///
+    /// Nothing is returned: when the write fails we do not lose any state, because
+    /// the pending list and the waiting flushes are kept as they are, so a later
+    /// call can try again.
+    async fn flush(&mut self) {
+        if self.pending.is_empty() {
+            // Nothing to persist, so every waiter can be released immediately
+            self.pending_flushes.drain(..).for_each(|op| op.notify());
+            return;
+        }
+
+        if let Err(e) = self.pending.save(self.conf).await {
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
+            warn!(
+                sequence = self.pending.sequence,
+                "Failed to write deletion list, will retry later ({e:#})"
+            );
+            return;
+        }
+        info!(sequence = self.pending.sequence, "Stored deletion list");
+
+        // The list is now persisted: release the waiters before handing it downstream
+        self.pending_flushes.drain(..).for_each(|op| op.notify());
+
+        // Swap in an empty list carrying the next sequence number, and pass the one
+        // we just stored on to the validator
+        let fresh = DeletionList::new(self.pending.sequence + 1);
+        let stored = std::mem::replace(&mut self.pending, fresh);
+
+        if let Err(e) = self.tx.send(ValidatorQueueMessage::Delete(stored)).await {
+            // This is allowed to fail: it will only happen if the backend worker is shut down,
+            // so we can just drop this on the floor.
+            info!("Deletion list dropped, this is normal during shutdown ({e:#})");
+        }
+    }
+
+    /// Read the deletion header to find the sequence number up to which deletion
+    /// lists have already been validated.
+    ///
+    /// Returns `Ok(None)` when the header file does not exist or cannot be
+    /// deserialized, in which case the caller should act as if validated_sequence
+    /// is 0.  Any other error reading the file is returned to the caller.
+    async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
+        let header_path = self.conf.deletion_header_path();
+        let header_bytes = match tokio::fs::read(&header_path).await {
+            Ok(bytes) => bytes,
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                // It is not an error for the header to not exist
+                debug!(
+                    "Deletion header {} not found, first start?",
+                    header_path.display()
+                );
+                return Ok(None);
+            }
+            Err(e) => return Err(anyhow::anyhow!(e)),
+        };
+
+        // The header was read: an undecodable one is logged and counted, then ignored
+        match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
+            Ok(header) => Ok(Some(header.validated_sequence)),
+            Err(e) => {
+                warn!(
+                    "Failed to deserialize deletion header, ignoring {}: {e:#}",
+                    header_path.display()
+                );
+                // This should never happen unless we make a mistake with our serialization.
+                // Ignoring a deletion header is not consequential for correctness because all
+                // deletions are ultimately allowed to fail: worst case we leak some objects for
+                // the scrubber to clean up.
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                Ok(None)
+            }
+        }
+    }
+
+    /// Replay deletion lists left on local disk by a previous process lifetime: remove stale
+    /// temporary files, choose the next sequence number for `self.pending`, and forward each
+    /// readable list to the validator.  Lists at or below the header's validated sequence are
+    /// marked validated.  Errors reading the directory or a list file, or sending to the
+    /// validator, abort recovery with `Err`; lists that fail to deserialize are skipped.
+    async fn recover(
+        &mut self,
+        attached_tenants: HashMap<TenantId, Generation>,
+    ) -> Result<(), anyhow::Error> {
+        debug!("recovering with {} attached tenants", attached_tenants.len());
+
+        // No usable header means nothing was recorded as validated: treat that as sequence 0.
+        let validated_sequence = self.load_validated_sequence().await?.unwrap_or(0);
+
+        let deletion_directory = self.conf.deletion_prefix();
+        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+            Ok(d) => d,
+            Err(e) => {
+                warn!(
+                    "Failed to open deletion list directory {}: {e:#}",
+                    deletion_directory.display(),
+                );
+
+                // Give up: if we can't read the deletion list directory, we probably can't
+                // write lists into it later, so the queue won't work.
+                return Err(e.into());
+            }
+        };
+        // Anchored, with a literal dot: only names that are exactly `<seq>-<version>.list` match.
+        let list_name_pattern =
+            Regex::new(r"^(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2})\.list$").unwrap();
+
+        let header_path = self.conf.deletion_header_path();
+        let mut seqs: Vec<u64> = Vec::new();
+        while let Some(dentry) = dir.next_entry().await? {
+            let file_name = dentry.file_name();
+            let dentry_str = file_name.to_string_lossy();
+
+            if Some(file_name.as_os_str()) == header_path.file_name() {
+                // Don't try and parse the header's name like a list
+                continue;
+            }
+
+            if dentry_str.ends_with(TEMP_SUFFIX) {
+                info!("Cleaning up temporary file {dentry_str}");
+                let absolute_path = deletion_directory.join(dentry.file_name());
+                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+                    // Non-fatal error: we will just leave the file behind but not
+                    // try and load it.
+                    warn!(
+                        "Failed to clean up temporary file {}: {e:#}",
+                        absolute_path.display()
+                    );
+                }
+
+                continue;
+            }
+
+            let file_name = dentry.file_name().to_owned();
+            let basename = file_name.to_string_lossy();
+            let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
+                m.name("sequence")
+                    .expect("Non optional group should be present")
+                    .as_str()
+            } else {
+                warn!("Unexpected key in deletion queue: {basename}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                continue;
+            };
+
+            let seq: u64 = match u64::from_str_radix(seq_part, 16) {
+                Ok(s) => s,
+                Err(e) => {
+                    warn!("Malformed key '{basename}': {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+            seqs.push(seq);
+        }
+        seqs.sort();
+
+        // Start our next deletion list after whichever is later: the last sequence validated
+        // by the previous process lifetime, or the highest sequence found on disk above, so
+        // that we never reuse the sequence number of a list that still exists.
+        self.pending.sequence = validated_sequence + 1;
+        if let Some(max_list_seq) = seqs.last() {
+            self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
+        }
+
+        for s in seqs {
+            let list_path = self.conf.deletion_list_path(s);
+
+            let list_bytes = tokio::fs::read(&list_path).await?;
+
+            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
+                Ok(l) => l,
+                Err(e) => {
+                    // Drop the list on the floor: any objects it referenced will be left behind
+                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
+                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    continue;
+                }
+            };
+
+            if deletion_list.sequence <= validated_sequence {
+                // If the deletion list is at or below validated_sequence, we may assume that it was
+                // already validated the last time this pageserver ran.  Otherwise, we still
+                // load it, as it may still contain content valid in this generation.
+                deletion_list.validated = true;
+            } else {
+                // Special case optimization: if a tenant is still attached, and no other
+                // generation was issued to another node in the interval while we restarted,
+                // then we may treat deletion lists from the previous generation as if they
+                // belong to our currently attached generation, and proceed to validate & execute.
+                for (tenant_id, tenant_list) in &mut deletion_list.tenants {
+                    if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+                        if attached_gen.previous() == tenant_list.generation {
+                            tenant_list.generation = *attached_gen;
+                        }
+                    }
+                }
+            }
+
+            info!(
+                validated = deletion_list.validated,
+                sequence = deletion_list.sequence,
+                "Recovered deletion list"
+            );
+
+            metrics::DELETION_QUEUE
+                .keys_submitted
+                .inc_by(deletion_list.len() as u64);
+            // We will drop out of recovery if this send fails: it indicates that we are shutting
+            // down or the backend has panicked
+            self.tx
+                .send(ValidatorQueueMessage::Delete(deletion_list))
+                .await?;
+        }
+
+        info!(next_sequence = self.pending.sequence, "Replay complete");
+
+        Ok(())
+    }
+
+    /// Main loop of the front-end ingest: bundles deletion requests into the pending DeletionList
+    /// and flushes it out, for later validation by the backend and execution by the executor.
+    pub(super) async fn background(&mut self) {
+        info!("Started deletion frontend worker");
+
+        // Synchronous, but we only do it once per process lifetime so it's tolerable
+        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+            tracing::error!(
+                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
+                self.conf.deletion_prefix().display()
+            );
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
+            return;
+        }
+
+        while !self.cancel.is_cancelled() {
+            let timeout = if self.pending_flushes.is_empty() {
+                FRONTEND_DEFAULT_TIMEOUT
+            } else {
+                FRONTEND_FLUSHING_TIMEOUT
+            };
+
+            let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
+                Ok(Some(msg)) => msg,
+                Ok(None) => {
+                    // Queue sender destroyed, shutting down
+                    break;
+                }
+                Err(_) => {
+                    // No message arrived within the timeout: flush whatever is pending.
+                    self.flush().await;
+                    continue;
+                }
+            };
+
+            match msg {
+                ListWriterQueueMessage::Delete(op) => {
+                    assert!(
+                        self.recovered,
+                        "Cannot process deletions before recovery.  This is a bug."
+                    );
+
+                    debug!(
+                        "Delete: ingesting {} layers, {} other objects",
+                        op.layers.len(),
+                        op.objects.len()
+                    );
+
+                    let mut layer_paths = Vec::new();
+                    for (layer, generation) in op.layers {
+                        layer_paths.push(remote_layer_path(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            &layer,
+                            generation,
+                        ));
+                    }
+                    layer_paths.extend(op.objects);
+
+                    if !self.pending.push(
+                        &op.tenant_id,
+                        &op.timeline_id,
+                        op.generation,
+                        &mut layer_paths,
+                    ) {
+                        self.flush().await;
+                        let retry_succeeded = self.pending.push(
+                            &op.tenant_id,
+                            &op.timeline_id,
+                            op.generation,
+                            &mut layer_paths,
+                        );
+                        if !retry_succeeded {
+                            // Unexpected: after we flush, we should have
+                            // drained self.pending, so a conflict on
+                            // generation numbers should be impossible.
+                            tracing::error!(
+                                "Failed to enqueue deletions, leaking objects.  This is a bug."
+                            );
+                            metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        }
+                    }
+                }
+                ListWriterQueueMessage::Flush(op) => {
+                    if self.pending.is_empty() {
+                        // Nothing is pending, so there is nothing to write: notify right away
+                        debug!("Flush: No pending objects, flushing immediately");
+                        op.notify()
+                    } else {
+                        // Defer to the next flush(); the check after this match calls it while pending_flushes is non-empty
+                        debug!("Flush: adding to pending flush list for next deadline flush");
+                        self.pending_flushes.push(op);
+                    }
+                }
+                ListWriterQueueMessage::FlushExecute(op) => {
+                    debug!("FlushExecute: passing through to backend");
+                    // We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
+                    if let Err(e) = self.tx.send(ValidatorQueueMessage::Flush(op)).await {
+                        info!("Can't flush, shutting down ({e})");
+                        // Caller will get error when their oneshot sender was dropped.
+                    }
+                }
+                ListWriterQueueMessage::Recover(op) => {
+                    if self.recovered {
+                        tracing::error!(
+                            "Deletion queue recovery called more than once.  This is a bug."
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
+                        continue;
+                    }
+
+                    if let Err(e) = self.recover(op.attached_tenants).await {
+                        // This should only happen in truly unrecoverable cases, like the recovery finding that the backend
+                        // queue receiver has been dropped, or something is critically broken with
+                        // the local filesystem holding deletion lists.
+                        info!(
+                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
+                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
+                        return;
+                    } else {
+                        self.recovered = true;
+                    }
+                }
+            }
+
+            if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
+                self.flush().await;
+            }
+        }
+        info!("Deletion queue shut down.");
+    }
+}
diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
new file mode 100644
index 0000000000..64603045d2
--- /dev/null
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -0,0 +1,414 @@
+//! The validator is responsible for validating DeletionLists for execution,
+//! based on whether the generation in the DeletionList is still the latest
+//! generation for a tenant.
+//!
+//! The purpose of validation is to ensure split-brain safety in the cluster
+//! of pageservers: a deletion may only be executed if the tenant generation
+//! that originated it is still current.  See docs/rfcs/025-generation-numbers.md
+//! The purpose of accumulating lists before validating them is to reduce load
+//! on the control plane API by issuing fewer, larger requests.
+//!
+//! In addition to validating DeletionLists, the validator validates updates to remote_consistent_lsn
+//! for timelines: these are logically deletions because the safekeepers use remote_consistent_lsn
+//! to decide when old WAL may be removed.
+//!
+//! Deletions are passed onward to the Deleter.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+use tracing::debug;
+use tracing::info;
+use tracing::warn;
+
+use crate::config::PageServerConf;
+use crate::control_plane_client::ControlPlaneGenerationsApi;
+use crate::control_plane_client::RetryForeverError;
+use crate::metrics;
+
+use super::deleter::DeleterMessage;
+use super::DeletionHeader;
+use super::DeletionList;
+use super::DeletionQueueError;
+use super::FlushOp;
+use super::VisibleLsnUpdates;
+
+// After this length of time without receiving a message, do any validation work
+// that is pending, even if we haven't accumulated many keys to delete.
+//
+// This also causes updates to remote_consistent_lsn to be validated, even
+// if there were no deletions enqueued.
+const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
+
+// Once more than this many keys are awaiting validation, flush without waiting for the interval
+const AUTOFLUSH_KEY_COUNT: usize = 16384;
+
+#[derive(Debug)]
+pub(super) enum ValidatorQueueMessage {
+    Delete(DeletionList), // Queue a list: validated ones skip validation, others await it
+    Flush(FlushOp), // Validate and execute everything queued, then notify the op on success
+}
+pub(super) struct Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    conf: &'static PageServerConf,
+    rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+    tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+
+    // Client for calling into control plane API for validation of deletes
+    control_plane_client: Option<C>,
+
+    // DeletionLists which are awaiting generation validation.  Not safe to
+    // execute until [`validate`] has processed them.
+    pending_lists: Vec<DeletionList>,
+
+    // DeletionLists which have passed validation and are ready to execute.
+    validated_lists: Vec<DeletionList>,
+
+    // Sum of all the lengths of lists in pending_lists
+    pending_key_count: usize,
+
+    // Lsn validation state: we read projected LSNs and write back visible LSNs
+    // after validation.  This is the LSN equivalent of `pending_lists`:
+    // it is drained in [`validate`]
+    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+
+    // If we failed to rewrite a deletion list due to local filesystem I/O failure,
+    // we must remember that and refuse to advance our persistent validated sequence
+    // number past the failure.
+    list_write_failed: Option<u64>,
+
+    cancel: CancellationToken,
+}
+
+impl<C> Validator<C>
+where
+    C: ControlPlaneGenerationsApi,
+{
+    pub(super) fn new(
+        conf: &'static PageServerConf,
+        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+        control_plane_client: Option<C>,
+        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            conf,
+            rx,
+            tx,
+            control_plane_client,
+            pending_lists: Vec::new(), // nothing awaiting validation yet
+            validated_lists: Vec::new(), // nothing ready to execute yet
+            pending_key_count: 0,
+            lsn_table,
+            list_write_failed: None, // no list rewrite has failed so far
+            cancel,
+        }
+    }
+    /// Process any outstanding validations of generations of pending LSN updates or pending
+    /// DeletionLists.
+    ///
+    /// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
+    /// go into the queue of ready-to-execute lists.
+    async fn validate(&mut self) -> Result<(), DeletionQueueError> {
+        let mut tenant_generations = HashMap::new();
+        for list in &self.pending_lists {
+            for (tenant_id, tenant_list) in &list.tenants {
+                // Note: DeletionLists are in logical time order, so generation always
+                // goes up.  By doing a simple insert() we will always end up with
+                // the latest generation seen for a tenant.
+                tenant_generations.insert(*tenant_id, tenant_list.generation);
+            }
+        }
+
+        let pending_lsn_updates = {
+            let mut lsn_table = self.lsn_table.write().expect("Lock should not be poisoned");
+            std::mem::take(&mut *lsn_table)
+        };
+        for (tenant_id, update) in &pending_lsn_updates.tenants {
+            let entry = tenant_generations
+                .entry(*tenant_id)
+                .or_insert(update.generation);
+            if update.generation > *entry {
+                *entry = update.generation;
+            }
+        }
+
+        if tenant_generations.is_empty() {
+            // No work to do
+            return Ok(());
+        }
+
+        let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
+            match control_plane_client
+                .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
+                .await
+            {
+                Ok(tenants) => tenants,
+                Err(RetryForeverError::ShuttingDown) => {
+                    // The only way a validation call returns an error is when the cancellation token fires
+                    return Err(DeletionQueueError::ShuttingDown);
+                }
+            }
+        } else {
+            // Control plane API disabled.  In legacy mode we consider everything valid.
+            tenant_generations.keys().map(|k| (*k, true)).collect()
+        };
+
+        let mut validated_sequence: Option<u64> = None;
+
+        // Apply the validation results to the pending LSN updates
+        for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
+            let validated_generation = tenant_generations
+                .get(&tenant_id)
+                .expect("Map was built from the same keys we're reading");
+
+            let valid = tenants_valid
+                .get(&tenant_id)
+                .copied()
+                // If the tenant was missing from the validation response, it has been deleted.
+                // The Timeline that requested the LSN update is probably already torn down,
+                // or will be torn down soon.  In this case, drop the update by setting valid=false.
+                .unwrap_or(false);
+
+            if valid && *validated_generation == tenant_lsn_state.generation {
+                for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                    pending_lsn.result_slot.store(pending_lsn.projected);
+                }
+            } else {
+                // If we failed validation, then do not apply any of the projected updates
+                warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
+            }
+        }
+
+        // Apply the validation results to the pending deletion lists
+        for list in &mut self.pending_lists {
+            // Filter the list based on whether the server responded valid: true.
+            // If a tenant is omitted in the response, it has been deleted: its entries are
+            // dropped from the list as redundant (see the comment inside the closure).
+            let mut mutated = false;
+            list.tenants.retain(|tenant_id, tenant| {
+                let validated_generation = tenant_generations
+                    .get(tenant_id)
+                    .expect("Map was built from the same keys we're reading");
+
+                // If the tenant was missing from the validation response, it has been deleted.
+                // This means that a deletion is valid, but also redundant since the tenant's
+                // objects should have already been deleted.  Treat it as invalid to drop the
+                // redundant deletion.
+                let valid = tenants_valid.get(tenant_id).copied().unwrap_or(false);
+
+                // A list is valid if it comes from the current _or previous_ generation.
+                // - The previous generation case is permitted due to how we store deletion lists locally:
+                // if we see the immediately previous generation in a locally stored deletion list,
+                // it proves that this node's disk was used for both current & previous generations,
+                // and therefore no other node was involved in between: the two generations may be
+                // logically treated as the same.
+                // - In that previous generation case, we rewrote it to the current generation
+                // in recover(), so the comparison here is simply an equality.
+
+                let this_list_valid = valid
+                    && (tenant.generation == *validated_generation);
+
+                if !this_list_valid {
+                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
+                    mutated = true;
+                }
+                this_list_valid
+            });
+            list.validated = true;
+
+            if mutated {
+                // Save the deletion list if we had to make changes due to stale generations.  The
+                // saved list is valid for execution.
+                if let Err(e) = list.save(self.conf).await {
+                    // Highly unexpected.  Could happen if e.g. disk full.
+                    // If we didn't save the trimmed list, it is _not_ valid to execute.
+                    warn!("Failed to save modified deletion list {list}: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+
+                    // Rather than have a complex retry process, just drop it and leak the objects,
+                    // scrubber will clean up eventually.
+                    list.tenants.clear(); // Result is a valid-but-empty list, which is a no-op for execution.
+
+                    // We must remember this failure, to prevent later writing out a header that
+                    // would imply the unwritable list was valid on disk.
+                    if self.list_write_failed.is_none() {
+                        self.list_write_failed = Some(list.sequence);
+                    }
+                }
+            }
+
+            validated_sequence = Some(list.sequence);
+        }
+
+        if let Some(validated_sequence) = validated_sequence {
+            if let Some(list_write_failed) = self.list_write_failed {
+                // Rare error case: we failed to write out a deletion list to excise invalid
+                // entries, so we cannot advance the header's valid sequence number past that point.
+                //
+                // In this state we will continue to validate, execute and delete deletion lists,
+                // we just cannot update the header.  It should be noticed and fixed by a human due to
+                // the nonzero value of our unexpected_errors metric.
+                warn!(
+                    sequence_number = list_write_failed,
+                    "Cannot write header because writing a deletion list failed earlier",
+                );
+            } else {
+                // Write the queue header to record how far validation progressed.  This avoids having
+                // to rewrite each DeletionList to set validated=true in it.
+                let header = DeletionHeader::new(validated_sequence);
+
+                // A failed save is tolerated because the validated_sequence is an optimization.  If we
+                // fail to save it, then restart, we will drop some deletion lists, creating work for
+                // scrubber.  We log a warning and bump the unexpected_errors metric here.
+                if let Err(e) = header.save(self.conf).await {
+                    warn!("Failed to write deletion queue header: {e:#}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                }
+            }
+        }
+
+        // Transfer the validated lists to the validated queue, for eventual execution
+        self.validated_lists.append(&mut self.pending_lists);
+
+        Ok(())
+    }
+
+    async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
+        for list_path in list_paths {
+            debug!("Removing deletion list {}", list_path.display());
+
+            if let Err(e) = tokio::fs::remove_file(&list_path).await {
+                // Unexpected: we should have permissions and nothing else should be touching
+                // these files.  We stop at the first failure, leaving this file and any
+                // remaining ones behind.  Subsequent pageservers will try and load them again:
+                // hopefully whatever storage issue (probably permissions) has been fixed by then.
+                tracing::error!("Failed to delete {}: {e:#}", list_path.display());
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                break;
+            }
+        }
+    }
+
+    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
+        tracing::debug!("Flushing with {} pending lists", self.pending_lists.len());
+
+        // Issue any required generation validation calls to the control plane
+        self.validate().await?;
+
+        // After successful validation, nothing is pending: any lists that
+        // made it through validation will be in validated_lists.
+        assert!(self.pending_lists.is_empty());
+        self.pending_key_count = 0;
+
+        tracing::debug!(
+            "Validation complete, have {} validated lists",
+            self.validated_lists.len()
+        );
+
+        // Return quickly if we have no validated lists to execute.  This avoids flushing the
+        // executor when an idle backend hits its autoflush interval
+        if self.validated_lists.is_empty() {
+            return Ok(());
+        }
+
+        // Drain `validated_lists` into the executor, remembering each list's on-disk path
+        let mut executing_lists = Vec::new();
+        for list in self.validated_lists.drain(..) {
+            let list_path = self.conf.deletion_list_path(list.sequence);
+            let objects = list.into_remote_paths();
+            self.tx
+                .send(DeleterMessage::Delete(objects))
+                .await
+                .map_err(|_| DeletionQueueError::ShuttingDown)?;
+            executing_lists.push(list_path);
+        }
+
+        self.flush_executor().await?;
+
+        // Erase the deletion lists whose keys have all been deleted from remote storage
+        self.cleanup_lists(executing_lists).await;
+
+        Ok(())
+    }
+
+    async fn flush_executor(&mut self) -> Result<(), DeletionQueueError> {
+        // Ask the executor to flush, so that every key referenced by the deletion lists sent
+        // so far is actually removed from remote storage.  That must happen before the
+        // deletion lists themselves are deleted.
+        let (op, done) = FlushOp::new();
+        if self.tx.send(DeleterMessage::Flush(op)).await.is_err() {
+            return Err(DeletionQueueError::ShuttingDown);
+        }
+
+        // If the wait fails (presumably the op was dropped unnotified), report shutdown too.
+        done.await.map_err(|_| DeletionQueueError::ShuttingDown)
+    }
+
+    pub(super) async fn background(&mut self) {
+        tracing::info!("Started deletion backend worker");
+
+        while !self.cancel.is_cancelled() {
+            let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
+                Ok(Some(m)) => m,
+                Ok(None) => {
+                    // All queue senders closed
+                    info!("Shutting down");
+                    break;
+                }
+                Err(_) => {
+                    // Timeout, we hit deadline to execute whatever we have in hand.  flush() returns
+                    // early when there is nothing to validate and no validated lists to execute.
+                    match self.flush().await {
+                        Ok(()) => {}
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we are shutting down, then auto-flush can safely be skipped
+                        }
+                    }
+
+                    continue;
+                }
+            };
+
+            match msg {
+                ValidatorQueueMessage::Delete(list) => {
+                    if list.validated {
+                        // A pre-validated list may only be seen during recovery: its on-disk state has
+                        // validated=true, or recover() marked it because the header's sequence covers it
+                        self.validated_lists.push(list)
+                    } else {
+                        self.pending_key_count += list.len();
+                        self.pending_lists.push(list);
+                    }
+
+                    if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
+                        match self.flush().await {
+                            Ok(()) => {}
+                            Err(DeletionQueueError::ShuttingDown) => {
+                                // If we are shutting down, then auto-flush can safely be skipped
+                            }
+                        }
+                    }
+                }
+                ValidatorQueueMessage::Flush(op) => {
+                    match self.flush().await {
+                        Ok(()) => {
+                            op.notify();
+                        }
+                        Err(DeletionQueueError::ShuttingDown) => {
+                            // If we fail due to shutting down, we will just drop `op` to propagate that status.
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 4988641d6a..f5c1224f01 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1093,6 +1093,9 @@ components:
         remote_consistent_lsn:
           type: string
           format: hex
+        remote_consistent_lsn_visible:
+          type: string
+          format: hex
         ancestor_timeline_id:
           type: string
           format: hex
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a8e914ba08..e61a9dcf3f 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -5,6 +5,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 
 use anyhow::{anyhow, Context, Result};
+use futures::TryFutureExt;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -24,6 +25,7 @@ use super::models::{
     TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -34,7 +36,7 @@ use crate::tenant::mgr::{
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
@@ -61,6 +63,7 @@ pub struct State {
     remote_storage: Option<GenericRemoteStorage>,
     broker_client: storage_broker::BrokerClientChannel,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+    deletion_queue_client: DeletionQueueClient,
 }
 
 impl State {
@@ -70,6 +73,7 @@ impl State {
         remote_storage: Option<GenericRemoteStorage>,
         broker_client: storage_broker::BrokerClientChannel,
         disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
+        deletion_queue_client: DeletionQueueClient,
     ) -> anyhow::Result<Self> {
         let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
             .iter()
@@ -82,8 +86,17 @@ impl State {
             remote_storage,
             broker_client,
             disk_usage_eviction_state,
+            deletion_queue_client,
         })
     }
+
+    fn tenant_resources(&self) -> TenantSharedResources {
+        TenantSharedResources {
+            broker_client: self.broker_client.clone(),
+            remote_storage: self.remote_storage.clone(),
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
+    }
 }
 
 #[inline(always)]
@@ -283,7 +296,12 @@ async fn build_timeline_info_common(
     };
     let current_physical_size = Some(timeline.layer_size_sum().await);
     let state = timeline.current_state();
-    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+    let remote_consistent_lsn_projected = timeline
+        .get_remote_consistent_lsn_projected()
+        .unwrap_or(Lsn(0));
+    let remote_consistent_lsn_visible = timeline
+        .get_remote_consistent_lsn_visible()
+        .unwrap_or(Lsn(0));
 
     let walreceiver_status = timeline.walreceiver_status();
 
@@ -293,7 +311,8 @@ async fn build_timeline_info_common(
         ancestor_timeline_id,
         ancestor_lsn,
         disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
-        remote_consistent_lsn,
+        remote_consistent_lsn: remote_consistent_lsn_projected,
+        remote_consistent_lsn_visible,
         last_record_lsn,
         prev_record_lsn: Some(timeline.get_prev_record_lsn()),
         latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -492,24 +511,23 @@ async fn tenant_attach_handler(
 
     let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
 
-    if let Some(remote_storage) = &state.remote_storage {
-        mgr::attach_tenant(
-            state.conf,
-            tenant_id,
-            generation,
-            tenant_conf,
-            state.broker_client.clone(),
-            remote_storage.clone(),
-            &ctx,
-        )
-        .instrument(info_span!("tenant_attach", %tenant_id))
-        .await?;
-    } else {
+    if state.remote_storage.is_none() {
         return Err(ApiError::BadRequest(anyhow!(
             "attach_tenant is not possible because pageserver was configured without remote storage"
         )));
     }
 
+    mgr::attach_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        tenant_conf,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_attach", %tenant_id))
+    .await?;
+
     json_response(StatusCode::ACCEPTED, ())
 }
 
@@ -570,6 +588,7 @@ async fn tenant_load_handler(
         generation,
         state.broker_client.clone(),
         state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
         &ctx,
     )
     .instrument(info_span!("load", %tenant_id))
@@ -911,8 +930,7 @@ async fn tenant_create_handler(
         tenant_conf,
         target_tenant_id,
         generation,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
+        state.tenant_resources(),
         &ctx,
     )
     .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
@@ -1129,6 +1147,39 @@ async fn timeline_download_remote_layers_handler_get(
     json_response(StatusCode::OK, info)
 }
 
+/// `PUT /v1/deletion_queue/flush`: flush the deletion queue (`?execute=true` uses
+/// `flush_execute`). Flush errors and `cancel` firing both yield `ApiError::ShuttingDown`.
+async fn deletion_queue_flush(
+    r: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&r);
+    if state.remote_storage.is_none() {
+        // Nothing to do if remote storage is disabled.
+        return json_response(StatusCode::OK, ());
+    }
+
+    let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
+    let flush = async {
+        if execute {
+            state.deletion_queue_client.flush_execute().await
+        } else {
+            state.deletion_queue_client.flush().await
+        }
+    }
+    // DeletionQueueError's only case is shutting down.
+    .map_err(|_| ApiError::ShuttingDown);
+
+    tokio::select! {
+        res = flush => {
+            res.map(|()| json_response(StatusCode::OK, ()))?
+        }
+        _ = cancel.cancelled() => {
+            Err(ApiError::ShuttingDown)
+        }
+    }
+}
+
 async fn active_timeline_of_active_tenant(
     tenant_id: TenantId,
     timeline_id: TimelineId,
@@ -1463,6 +1514,9 @@ pub fn make_router(
         .put("/v1/disk_usage_eviction/run", |r| {
             api_handler(r, disk_usage_eviction_run)
         })
+        .put("/v1/deletion_queue/flush", |r| {
+            api_handler(r, deletion_queue_flush)
+        })
         .put("/v1/tenant/:tenant_id/break", |r| {
             testing_api_handler("set tenant state to broken", r, handle_tenant_break)
         })
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 3049ad6a4e..e370e063ba 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,7 +3,8 @@ pub mod basebackup;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
-mod control_plane_client;
+pub mod control_plane_client;
+pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
@@ -27,6 +28,7 @@ pub mod failpoint_support;
 use std::path::Path;
 
 use crate::task_mgr::TaskKind;
+use deletion_queue::DeletionQueue;
 use tracing::info;
 
 /// Current storage format version
@@ -48,8 +50,8 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
 pub use crate::metrics::preinitialize_metrics;
 
-#[tracing::instrument]
-pub async fn shutdown_pageserver(exit_code: i32) {
+#[tracing::instrument(skip_all, fields(%exit_code))]
+pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
     use std::time::Duration;
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
@@ -77,6 +79,11 @@ pub async fn shutdown_pageserver(exit_code: i32) {
     )
     .await;
 
+    // Best effort to persist any outstanding deletions, to avoid leaking objects
+    if let Some(mut deletion_queue) = deletion_queue {
+        deletion_queue.shutdown(Duration::from_secs(5)).await;
+    }
+
     // Shut down the HTTP endpoint last, so that you can still check the server's
     // status while it's shutting down.
     // FIXME: We should probably stop accepting commands like attach/detach earlier.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 98dee095a3..b085176f18 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -887,6 +887,54 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
     .expect("failed to define a metric")
 });
 
+/// Counters tracking the deletion queue; held in the `DELETION_QUEUE` static.
+pub(crate) struct DeletionQueueMetrics {
+    pub(crate) keys_submitted: IntCounter,
+    pub(crate) keys_dropped: IntCounter,
+    pub(crate) keys_executed: IntCounter,
+    pub(crate) dropped_lsn_updates: IntCounter,
+    pub(crate) unexpected_errors: IntCounter,
+    pub(crate) remote_errors: IntCounterVec,
+}
+
+/// Deletion queue metrics, registered the first time this static is dereferenced.
+/// `preinitialize_metrics` forces it via `Lazy::force`.
+pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
+    DeletionQueueMetrics {
+        keys_submitted: register_int_counter!(
+            "pageserver_deletion_queue_submitted_total",
+            "Number of objects submitted for deletion"
+        )
+        .expect("failed to define a metric"),
+        keys_dropped: register_int_counter!(
+            "pageserver_deletion_queue_dropped_total",
+            "Number of object deletions dropped due to stale generation."
+        )
+        .expect("failed to define a metric"),
+        keys_executed: register_int_counter!(
+            "pageserver_deletion_queue_executed_total",
+            "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
+        )
+        .expect("failed to define a metric"),
+        dropped_lsn_updates: register_int_counter!(
+            "pageserver_deletion_queue_dropped_lsn_updates_total",
+            "Updates to remote_consistent_lsn dropped due to stale generation number."
+        )
+        .expect("failed to define a metric"),
+        unexpected_errors: register_int_counter!(
+            "pageserver_deletion_queue_unexpected_errors_total",
+            "Number of unexpected conditions that may stall the queue: any value above zero is unexpected."
+        )
+        .expect("failed to define a metric"),
+        remote_errors: register_int_counter_vec!(
+            "pageserver_deletion_queue_remote_errors_total",
+            "Retryable remote I/O errors while executing deletions, for example 503 responses to DeleteObjects",
+            &["op_kind"],
+        )
+        .expect("failed to define a metric"),
+    }
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -1675,6 +1723,9 @@ pub fn preinitialize_metrics() {
         Lazy::force(c);
     });
 
+    // Deletion queue stats
+    Lazy::force(&DELETION_QUEUE);
+
     // countervecs
     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
         .into_iter()
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 047fa761c3..7a94c3449d 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -37,7 +37,7 @@ impl Key {
             | self.field6 as i128
     }
 
-    pub fn from_i128(x: i128) -> Self {
+    pub const fn from_i128(x: i128) -> Self {
         Key {
             field1: ((x >> 120) & 0xf) as u8,
             field2: ((x >> 104) & 0xFFFF) as u32,
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 650bc119b6..017322ffb2 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -456,7 +456,7 @@ async fn task_finish(
     }
 
     if shutdown_process {
-        shutdown_pageserver(1).await;
+        shutdown_pageserver(None, 1).await;
     }
 }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1c92c618fa..47bfd4a8ef 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -57,6 +57,7 @@ use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT_ACTIVATION;
@@ -117,7 +118,7 @@ mod span;
 
 pub mod metadata;
 mod par_fsync;
-mod remote_timeline_client;
+pub mod remote_timeline_client;
 pub mod storage_layer;
 
 pub mod config;
@@ -157,6 +158,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
 pub struct TenantSharedResources {
     pub broker_client: storage_broker::BrokerClientChannel,
     pub remote_storage: Option<GenericRemoteStorage>,
+    pub deletion_queue_client: DeletionQueueClient,
 }
 
 ///
@@ -197,6 +199,9 @@ pub struct Tenant {
     // provides access to timeline data sitting in the remote storage
     pub(crate) remote_storage: Option<GenericRemoteStorage>,
 
+    // Access to global deletion queue for when this tenant wants to schedule a deletion
+    deletion_queue_client: DeletionQueueClient,
+
     /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
     cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
     cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -523,15 +528,20 @@ impl Tenant {
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         generation: Generation,
-        broker_client: storage_broker::BrokerClientChannel,
+        resources: TenantSharedResources,
         tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        remote_storage: GenericRemoteStorage,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
         // TODO dedup with spawn_load
         let tenant_conf =
             Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
 
+        let TenantSharedResources {
+            broker_client,
+            remote_storage,
+            deletion_queue_client,
+        } = resources;
+
         let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
         let tenant = Arc::new(Tenant::new(
             TenantState::Attaching,
@@ -540,7 +550,8 @@ impl Tenant {
             wal_redo_manager,
             tenant_id,
             generation,
-            Some(remote_storage.clone()),
+            remote_storage.clone(),
+            deletion_queue_client,
         ));
 
         // Do all the hard work in the background
@@ -571,7 +582,7 @@ impl Tenant {
                 let pending_deletion = {
                     match DeleteTenantFlow::should_resume_deletion(
                         conf,
-                        Some(&remote_storage),
+                        remote_storage.as_ref(),
                         &tenant_clone,
                     )
                     .await
@@ -660,6 +671,7 @@ impl Tenant {
         for timeline_id in remote_timeline_ids {
             let client = RemoteTimelineClient::new(
                 remote_storage.clone(),
+                self.deletion_queue_client.clone(),
                 self.conf,
                 self.tenant_id,
                 timeline_id,
@@ -726,6 +738,7 @@ impl Tenant {
                 remote_metadata,
                 TimelineResources {
                     remote_client: Some(remote_client),
+                    deletion_queue_client: self.deletion_queue_client.clone(),
                 },
                 ctx,
             )
@@ -750,6 +763,7 @@ impl Tenant {
                 timeline_id,
                 &index_part.metadata,
                 Some(remote_timeline_client),
+                self.deletion_queue_client.clone(),
                 None,
             )
             .await
@@ -851,6 +865,7 @@ impl Tenant {
             tenant_id,
             Generation::broken(),
             None,
+            DeletionQueueClient::broken(),
         ))
     }
 
@@ -895,6 +910,7 @@ impl Tenant {
             tenant_id,
             generation,
             remote_storage.clone(),
+            resources.deletion_queue_client.clone(),
         );
         let tenant = Arc::new(tenant);
 
@@ -1302,6 +1318,7 @@ impl Tenant {
                                 timeline_id,
                                 &local_metadata,
                                 Some(remote_client),
+                                self.deletion_queue_client.clone(),
                                 init_order,
                             )
                             .await
@@ -1351,6 +1368,7 @@ impl Tenant {
                         timeline_id,
                         &local_metadata,
                         None,
+                        self.deletion_queue_client.clone(),
                         init_order,
                     )
                     .await
@@ -2242,6 +2260,9 @@ impl Tenant {
         Ok(timeline)
     }
 
+    // Allow too_many_arguments because a constructor's argument list naturally grows with the
+    // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
+    #[allow(clippy::too_many_arguments)]
     fn new(
         state: TenantState,
         conf: &'static PageServerConf,
@@ -2250,6 +2271,7 @@ impl Tenant {
         tenant_id: TenantId,
         generation: Generation,
         remote_storage: Option<GenericRemoteStorage>,
+        deletion_queue_client: DeletionQueueClient,
     ) -> Tenant {
         let (state, mut rx) = watch::channel(state);
 
@@ -2317,6 +2339,7 @@ impl Tenant {
             gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
+            deletion_queue_client,
             state,
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
@@ -2856,6 +2879,7 @@ impl Tenant {
         let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
             let remote_client = RemoteTimelineClient::new(
                 remote_storage.clone(),
+                self.deletion_queue_client.clone(),
                 self.conf,
                 self.tenant_id,
                 timeline_id,
@@ -2866,7 +2890,10 @@ impl Tenant {
             None
         };
 
-        TimelineResources { remote_client }
+        TimelineResources {
+            remote_client,
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
     }
 
     /// Creates intermediate timeline structure and its files.
@@ -3322,6 +3349,7 @@ pub mod harness {
     use utils::logging;
     use utils::lsn::Lsn;
 
+    use crate::deletion_queue::mock::MockDeletionQueue;
     use crate::{
         config::PageServerConf,
         repository::Key,
@@ -3383,6 +3411,7 @@ pub mod harness {
         pub generation: Generation,
         pub remote_storage: GenericRemoteStorage,
         pub remote_fs_dir: PathBuf,
+        pub deletion_queue: MockDeletionQueue,
     }
 
     static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3431,6 +3460,7 @@ pub mod harness {
                 storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
             };
             let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
+            let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
 
             Ok(Self {
                 conf,
@@ -3439,6 +3469,7 @@ pub mod harness {
                 generation: Generation::new(0xdeadbeef),
                 remote_storage,
                 remote_fs_dir,
+                deletion_queue,
             })
         }
 
@@ -3463,6 +3494,7 @@ pub mod harness {
                 self.tenant_id,
                 self.generation,
                 Some(self.remote_storage.clone()),
+                self.deletion_queue.new_client(),
             ));
             tenant
                 .load(None, ctx)
@@ -4193,7 +4225,8 @@ mod tests {
     //
     #[tokio::test]
     async fn test_bulk_insert() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await;
+        let harness = TenantHarness::create("test_bulk_insert")?;
+        let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -4240,7 +4273,8 @@ mod tests {
 
     #[tokio::test]
     async fn test_random_updates() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await;
+        let harness = TenantHarness::create("test_random_updates")?;
+        let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 74faee1115..6f3863dd4b 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,7 +20,10 @@ use utils::crashsafe;
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::control_plane_client::ControlPlaneClient;
+use crate::control_plane_client::{
+    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
+};
+use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::delete::DeleteTenantFlow;
@@ -116,7 +119,23 @@ pub async fn init_tenant_mgr(
 
     // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
     let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
-        Some(client.re_attach().await?)
+        let result = match client.re_attach().await {
+            Ok(tenants) => tenants,
+            Err(RetryForeverError::ShuttingDown) => {
+                anyhow::bail!("Shut down while waiting for control plane re-attach response")
+            }
+        };
+
+        // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
+        // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
+        // the queue. Sequential processing of the queue ensures that recovery is done before any new tenant deletions
+        // are processed, even though we don't block on recovery completing here.
+        resources
+            .deletion_queue_client
+            .recover(result.clone())
+            .await?;
+
+        Some(result)
     } else {
         info!("Control plane API not configured, tenant generations are disabled");
         None
@@ -285,29 +304,21 @@ pub(crate) fn schedule_local_tenant_processing(
 
     let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
         info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if let Some(remote_storage) = resources.remote_storage {
-            match Tenant::spawn_attach(
-                conf,
-                tenant_id,
-                generation,
-                resources.broker_client,
-                tenants,
-                remote_storage,
-                ctx,
-            ) {
-                Ok(tenant) => tenant,
-                Err(e) => {
-                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
-                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
-                }
-            }
-        } else {
+        if resources.remote_storage.is_none() {
             warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
             Tenant::create_broken_tenant(
                 conf,
                 tenant_id,
                 "attaching mark file present but no remote storage configured".to_string(),
             )
+        } else {
+            match Tenant::spawn_attach(conf, tenant_id, generation, resources, tenants, ctx) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
         }
     } else {
         info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -438,8 +449,7 @@ pub async fn create_tenant(
     tenant_conf: TenantConfOpt,
     tenant_id: TenantId,
     generation: Generation,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    resources: TenantSharedResources,
     ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
     tenant_map_insert(tenant_id, || async {
@@ -450,13 +460,9 @@ pub async fn create_tenant(
         // TODO: tenant directory remains on disk if we bail out from here on.
         //       See https://github.com/neondatabase/neon/issues/4233
 
-        let tenant_resources = TenantSharedResources {
-            broker_client,
-            remote_storage,
-        };
         let created_tenant =
             schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                generation, tenant_resources, None, &TENANTS, ctx)?;
+                generation, resources, None, &TENANTS, ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
 
@@ -622,6 +628,7 @@ pub async fn load_tenant(
     generation: Generation,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
+    deletion_queue_client: DeletionQueueClient,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
     tenant_map_insert(tenant_id, || async {
@@ -635,6 +642,7 @@ pub async fn load_tenant(
         let resources = TenantSharedResources {
             broker_client,
             remote_storage,
+            deletion_queue_client
         };
         let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None,  &TENANTS, ctx)
             .with_context(|| {
@@ -702,8 +710,7 @@ pub async fn attach_tenant(
     tenant_id: TenantId,
     generation: Generation,
     tenant_conf: TenantConfOpt,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: GenericRemoteStorage,
+    resources: TenantSharedResources,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
     tenant_map_insert(tenant_id, || async {
@@ -718,10 +725,7 @@ pub async fn attach_tenant(
             .context("check for attach marker file existence")?;
         anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
 
-        let resources = TenantSharedResources {
-            broker_client,
-            remote_storage: Some(remote_storage),
-        };
+
         let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
         // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
         //      See https://github.com/neondatabase/neon/issues/4233
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 6f42b54ac2..4e495d9bb2 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -116,8 +116,12 @@
 //! # Completion
 //!
 //! Once an operation has completed, we update
-//! [`UploadQueueInitialized::last_uploaded_consistent_lsn`] which indicates
-//! to safekeepers that they can delete the WAL up to that LSN.
+//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
+//! and submit a request through the DeletionQueue to update
+//! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
+//! validated that our generation is not stale.  It is this visible value
+//! that is advertised to safekeepers as a signal that they can
+//! delete the WAL up to that LSN.
 //!
 //! The [`RemoteTimelineClient::wait_completion`] method can be used to wait
 //! for all pending operations to complete. It does not prevent more
@@ -200,7 +204,6 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
 
-mod delete;
 mod download;
 pub mod index;
 mod upload;
@@ -226,6 +229,7 @@ use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
+use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{
     MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
     RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -324,6 +328,8 @@ pub struct RemoteTimelineClient {
     metrics: Arc<RemoteTimelineClientMetrics>,
 
     storage_impl: GenericRemoteStorage,
+
+    deletion_queue_client: DeletionQueueClient,
 }
 
 impl RemoteTimelineClient {
@@ -335,6 +341,7 @@ impl RemoteTimelineClient {
     ///
     pub fn new(
         remote_storage: GenericRemoteStorage,
+        deletion_queue_client: DeletionQueueClient,
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         timeline_id: TimelineId,
@@ -352,6 +359,7 @@ impl RemoteTimelineClient {
             timeline_id,
             generation,
             storage_impl: remote_storage,
+            deletion_queue_client,
             upload_queue: Mutex::new(UploadQueue::Uninitialized),
             metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
         }
@@ -413,13 +421,24 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
-    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
-        match &*self.upload_queue.lock().unwrap() {
+    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+        match &mut *self.upload_queue.lock().unwrap() {
             UploadQueue::Uninitialized => None,
-            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => {
-                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
-            }
+            UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
+            UploadQueue::Stopped(q) => q
+                .upload_queue_for_deletion
+                .get_last_remote_consistent_lsn_projected(),
+        }
+    }
+
+    pub fn remote_consistent_lsn_visible(&self) -> Option<Lsn> {
+        match &mut *self.upload_queue.lock().unwrap() {
+            UploadQueue::Uninitialized => None,
+            UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
+            UploadQueue::Stopped(q) => Some(
+                q.upload_queue_for_deletion
+                    .get_last_remote_consistent_lsn_visible(),
+            ),
         }
     }
 
@@ -643,7 +662,7 @@ impl RemoteTimelineClient {
     /// successfully.
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
-        names: &[LayerFileName],
+        names: Vec<LayerFileName>,
     ) -> anyhow::Result<()> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
@@ -663,10 +682,10 @@ impl RemoteTimelineClient {
             // Decorate our list of names with each name's generation, dropping
             // makes that are unexpectedly missing from our metadata.
             let with_generations: Vec<_> = names
-                .iter()
+                .into_iter()
                 .filter_map(|name| {
                     // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(name);
+                    let meta = upload_queue.latest_files.remove(&name);
 
                     if let Some(meta) = meta {
                         upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -688,19 +707,17 @@ impl RemoteTimelineClient {
                 self.schedule_index_upload(upload_queue, metadata);
             }
 
-            // schedule the actual deletions
-            for (name, generation) in with_generations {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: false,
-                    generation,
-                });
-                self.calls_unfinished_metric_begin(&op);
-                upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {name}");
+            for (name, gen) in &with_generations {
+                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
             }
 
+            // schedule the actual deletions
+            let op = UploadOp::Delete(Delete {
+                layers: with_generations,
+            });
+            self.calls_unfinished_metric_begin(&op);
+            upload_queue.queued_operations.push_back(op);
+
             // Launch the tasks immediately, if possible
             self.launch_queued_tasks(upload_queue);
         };
@@ -833,9 +850,7 @@ impl RemoteTimelineClient {
     pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        let (mut receiver, deletions_queued) = {
-            let mut deletions_queued = 0;
-
+        let layers: Vec<RemotePath> = {
             let mut locked = self.upload_queue.lock().unwrap();
             let stopped = locked.stopped_mut()?;
 
@@ -847,42 +862,30 @@ impl RemoteTimelineClient {
 
             stopped
                 .upload_queue_for_deletion
-                .queued_operations
-                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
-
-            // schedule the actual deletions
-            for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
-                let op = UploadOp::Delete(Delete {
-                    file_kind: RemoteOpFileKind::Layer,
-                    layer_file_name: name.clone(),
-                    scheduled_from_timeline_delete: true,
-                    generation: meta.generation,
-                });
-
-                self.calls_unfinished_metric_begin(&op);
-                stopped
-                    .upload_queue_for_deletion
-                    .queued_operations
-                    .push_back(op);
-
-                info!("scheduled layer file deletion {name}");
-                deletions_queued += 1;
-            }
-
-            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
-
-            (
-                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
-                deletions_queued,
-            )
+                .latest_files
+                .drain()
+                .map(|(file_name, meta)| {
+                    remote_layer_path(
+                        &self.tenant_id,
+                        &self.timeline_id,
+                        &file_name,
+                        meta.generation,
+                    )
+                })
+                .collect()
         };
 
-        receiver.changed().await.context("upload queue shut down")?;
+        let layer_deletion_count = layers.len();
+        self.deletion_queue_client.push_immediate(layers).await?;
 
         // Do not delete index part yet, it is needed for possible retry. If we remove it first
         // and retry will arrive to different pageserver there wont be any traces of it on remote storage
         let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
 
+        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // taking the burden of listing all the layers that we already know we should delete.
+        self.deletion_queue_client.flush_immediate().await?;
+
         let remaining = backoff::retry(
             || async {
                 self.storage_impl
@@ -910,17 +913,9 @@ impl RemoteTimelineClient {
             })
             .collect();
 
+        let not_referenced_count = remaining.len();
         if !remaining.is_empty() {
-            backoff::retry(
-                || async { self.storage_impl.delete_objects(&remaining).await },
-                |_e| false,
-                FAILED_UPLOAD_WARN_THRESHOLD,
-                FAILED_REMOTE_OP_RETRIES,
-                "delete_objects",
-                backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
-            )
-            .await
-            .context("delete_objects")?;
+            self.deletion_queue_client.push_immediate(remaining).await?;
         }
 
         fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -931,18 +926,14 @@ impl RemoteTimelineClient {
 
         let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
 
-        debug!("deleting index part");
+        debug!("enqueuing index part deletion");
+        self.deletion_queue_client
+            .push_immediate([index_file_path].to_vec())
+            .await?;
 
-        backoff::retry(
-            || async { self.storage_impl.delete(&index_file_path).await },
-            |_e| false,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "delete_index",
-            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
-        )
-        .await
-        .context("delete_index")?;
+        // Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
+        // for a flush to a persistent deletion list so that we may be sure deletion will occur.
+        self.deletion_queue_client.flush_immediate().await?;
 
         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
@@ -950,7 +941,7 @@ impl RemoteTimelineClient {
             ))?
         });
 
-        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
+        info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
 
         Ok(())
     }
@@ -1140,21 +1131,16 @@ impl RemoteTimelineClient {
                     }
                     res
                 }
-                UploadOp::Delete(delete) => {
-                    let path = &self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(delete.layer_file_name.file_name());
-                    delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
-                        .measure_remote_op(
-                            self.tenant_id,
-                            self.timeline_id,
-                            delete.file_kind,
-                            RemoteOpKind::Delete,
-                            Arc::clone(&self.metrics),
-                        )
-                        .await
-                }
+                UploadOp::Delete(delete) => self
+                    .deletion_queue_client
+                    .push_layers(
+                        self.tenant_id,
+                        self.timeline_id,
+                        self.generation,
+                        delete.layers.clone(),
+                    )
+                    .await
+                    .map_err(|e| anyhow::anyhow!(e)),
                 UploadOp::Barrier(_) => {
                     // unreachable. Barrier operations are handled synchronously in
                     // launch_queued_tasks
@@ -1210,18 +1196,12 @@ impl RemoteTimelineClient {
         }
 
         // The task has completed successfully. Remove it from the in-progress list.
-        {
+        let lsn_update = {
             let mut upload_queue_guard = self.upload_queue.lock().unwrap();
             let upload_queue = match upload_queue_guard.deref_mut() {
                 UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(stopped) => {
-                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
-                    // then stop() took care of it so we just return.
-                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
-                    match &task.op {
-                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
-                        _ => None
-                    }
+                UploadQueue::Stopped(_stopped) => {
+                    None
                 },
                 UploadQueue::Initialized(qi) => { Some(qi) }
             };
@@ -1236,23 +1216,51 @@ impl RemoteTimelineClient {
 
             upload_queue.inprogress_tasks.remove(&task.task_id);
 
-            match task.op {
+            let lsn_update = match task.op {
                 UploadOp::UploadLayer(_, _) => {
                     upload_queue.num_inprogress_layer_uploads -= 1;
+                    None
                 }
                 UploadOp::UploadMetadata(_, lsn) => {
                     upload_queue.num_inprogress_metadata_uploads -= 1;
-                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
+                    // XXX monotonicity check?
+
+                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
+                    if self.generation.is_none() {
+                        // Legacy mode: skip validating generation
+                        upload_queue.visible_remote_consistent_lsn.store(lsn);
+                        None
+                    } else {
+                        Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
+                    }
                 }
                 UploadOp::Delete(_) => {
                     upload_queue.num_inprogress_deletions -= 1;
+                    None
                 }
                 UploadOp::Barrier(_) => unreachable!(),
             };
 
             // Launch any queued tasks that were unblocked by this one.
             self.launch_queued_tasks(upload_queue);
+            lsn_update
+        };
+
+        if let Some((lsn, slot)) = lsn_update {
+            // Updates to the remote_consistent_lsn we advertise to safekeepers
+            // are all routed through the DeletionQueue, to enforce important
+            // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
+            self.deletion_queue_client
+                .update_remote_consistent_lsn(
+                    self.tenant_id,
+                    self.timeline_id,
+                    self.generation,
+                    lsn,
+                    slot,
+                )
+                .await;
         }
+
         self.calls_unfinished_metric_end(&task.op);
     }
 
@@ -1278,8 +1286,8 @@ impl RemoteTimelineClient {
                     reason: "metadata uploads are tiny",
                 },
             ),
-            UploadOp::Delete(delete) => (
-                delete.file_kind,
+            UploadOp::Delete(_delete) => (
+                RemoteOpFileKind::Layer,
                 RemoteOpKind::Delete,
                 DontTrackSize {
                     reason: "should we track deletes? positive or negative sign?",
@@ -1341,7 +1349,10 @@ impl RemoteTimelineClient {
                         latest_files: initialized.latest_files.clone(),
                         latest_files_changes_since_metadata_upload_scheduled: 0,
                         latest_metadata: initialized.latest_metadata.clone(),
-                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
+                        projected_remote_consistent_lsn: None,
+                        visible_remote_consistent_lsn: initialized
+                            .visible_remote_consistent_lsn
+                            .clone(),
                         num_inprogress_layer_uploads: 0,
                         num_inprogress_metadata_uploads: 0,
                         num_inprogress_deletions: 0,
@@ -1405,13 +1416,13 @@ pub fn remote_layer_path(
     tenant_id: &TenantId,
     timeline_id: &TimelineId,
     layer_file_name: &LayerFileName,
-    layer_meta: &LayerFileMetadata,
+    generation: Generation,
 ) -> RemotePath {
     // Generation-aware key format
     let path = format!(
         "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
         layer_file_name.file_name(),
-        layer_meta.generation.get_suffix()
+        generation.get_suffix()
     );
 
     RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1554,7 +1565,6 @@ mod tests {
 
     impl TestSetup {
         async fn new(test_name: &str) -> anyhow::Result<Self> {
-            // Use a current-thread runtime in the test
             let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
             let harness = TenantHarness::create(test_name)?;
             let (tenant, ctx) = harness.load().await;
@@ -1580,6 +1590,7 @@ mod tests {
                 timeline_id: TIMELINE_ID,
                 generation,
                 storage_impl: self.harness.remote_storage.clone(),
+                deletion_queue_client: self.harness.deletion_queue.new_client(),
                 upload_queue: Mutex::new(UploadQueue::Uninitialized),
                 metrics: Arc::new(RemoteTimelineClientMetrics::new(
                     &self.harness.tenant_id,
@@ -1749,7 +1760,7 @@ mod tests {
             )
             .unwrap();
         client
-            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
             .unwrap();
         {
             let mut guard = client.upload_queue.lock().unwrap();
@@ -1775,6 +1786,7 @@ mod tests {
 
         // Finish them
         client.wait_completion().await.unwrap();
+        harness.deletion_queue.pump().await;
 
         assert_remote_files(
             &[
diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs
deleted file mode 100644
index 7324559223..0000000000
--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-//! Helper functions to delete files from remote storage with a RemoteStorage
-use anyhow::Context;
-use std::path::Path;
-use tracing::debug;
-
-use remote_storage::GenericRemoteStorage;
-
-use crate::{
-    config::PageServerConf,
-    tenant::{remote_timeline_client::remote_path, Generation},
-};
-
-pub(super) async fn delete_layer<'a>(
-    conf: &'static PageServerConf,
-    storage: &'a GenericRemoteStorage,
-    local_layer_path: &'a Path,
-    generation: Generation,
-) -> anyhow::Result<()> {
-    fail::fail_point!("before-delete-layer", |_| {
-        anyhow::bail!("failpoint before-delete-layer")
-    });
-    debug!("Deleting layer from remote storage: {local_layer_path:?}",);
-
-    let path_to_delete = remote_path(conf, local_layer_path, generation)?;
-
-    // We don't want to print an error if the delete failed if the file has
-    // already been deleted. Thankfully, in this situation S3 already
-    // does not yield an error. While OS-provided local file system APIs do yield
-    // errors, we avoid them in the `LocalFs` wrapper.
-    storage
-        .delete(&path_to_delete)
-        .await
-        .with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
-}
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 9863215529..5c173c613f 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -50,7 +50,12 @@ pub async fn download_layer_file<'a>(
         .timeline_path(&tenant_id, &timeline_id)
         .join(layer_file_name.file_name());
 
-    let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
+    let remote_path = remote_layer_path(
+        &tenant_id,
+        &timeline_id,
+        layer_file_name,
+        layer_metadata.generation,
+    );
 
     // Perform a rename inspired by durable_rename from file_utils.c.
     // The sequence:
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 78ac1338db..4fa5039d79 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -38,6 +38,7 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
     AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
@@ -143,6 +144,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
     pub remote_client: Option<RemoteTimelineClient>,
+    pub deletion_queue_client: DeletionQueueClient,
 }
 
 pub struct Timeline {
@@ -521,9 +523,23 @@ impl Timeline {
         self.disk_consistent_lsn.load()
     }
 
-    pub fn get_remote_consistent_lsn(&self) -> Option<Lsn> {
+    /// remote_consistent_lsn from the perspective of the tenant's current generation,
+    /// not validated with control plane yet.
+    /// See [`Self::get_remote_consistent_lsn_visible`].
+    pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
         if let Some(remote_client) = &self.remote_client {
-            remote_client.last_uploaded_consistent_lsn()
+            remote_client.remote_consistent_lsn_projected()
+        } else {
+            None
+        }
+    }
+
+    /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
+    /// i.e. a value of remote_consistent_lsn_projected which has undergone
+    /// generation validation in the deletion queue.
+    pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
+        if let Some(remote_client) = &self.remote_client {
+            remote_client.remote_consistent_lsn_visible()
         } else {
             None
         }
@@ -1820,7 +1836,7 @@ impl Timeline {
             for (layer, m) in needs_upload {
                 rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
             }
-            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
+            rtc.schedule_layer_file_deletion(needs_cleanup)?;
             rtc.schedule_index_upload_for_file_changes()?;
             // Tenant::create_timeline will wait for these uploads to happen before returning, or
             // on retry.
@@ -3875,7 +3891,7 @@ impl Timeline {
 
         // Also schedule the deletions in remote storage
         if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+            remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
         }
 
         Ok(())
@@ -4210,7 +4226,7 @@ impl Timeline {
             }
 
             if let Some(remote_client) = &self.remote_client {
-                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+                remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
             }
 
             apply.flush();
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 18588cf0fd..7d55388f44 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,6 +14,7 @@ use utils::{
 
 use crate::{
     config::PageServerConf,
+    deletion_queue::DeletionQueueClient,
     task_mgr::{self, TaskKind},
     tenant::{
         metadata::TimelineMetadata,
@@ -407,6 +408,7 @@ impl DeleteTimelineFlow {
         timeline_id: TimelineId,
         local_metadata: &TimelineMetadata,
         remote_client: Option<RemoteTimelineClient>,
+        deletion_queue_client: DeletionQueueClient,
         init_order: Option<&InitializationOrder>,
     ) -> anyhow::Result<()> {
         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -416,7 +418,10 @@ impl DeleteTimelineFlow {
                 timeline_id,
                 local_metadata,
                 None, // Ancestor is not needed for deletion.
-                TimelineResources { remote_client },
+                TimelineResources {
+                    remote_client,
+                    deletion_queue_client,
+                },
                 init_order,
                 // Important. We dont pass ancestor above because it can be missing.
                 // Thus we need to skip the validation here.
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 7d1e9b4a39..0831b9ceda 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -370,8 +370,9 @@ pub(super) async fn handle_walreceiver_connection(
             })?;
 
         if let Some(last_lsn) = status_update {
-            let timeline_remote_consistent_lsn =
-                timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
+            let timeline_remote_consistent_lsn = timeline
+                .get_remote_consistent_lsn_visible()
+                .unwrap_or(Lsn(0));
 
             // The last LSN we processed. It is not guaranteed to survive pageserver crash.
             let last_received_lsn = last_lsn;
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 28822335b0..08b1cb8866 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,5 +1,3 @@
-use crate::metrics::RemoteOpFileKind;
-
 use super::storage_layer::LayerFileName;
 use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
@@ -11,6 +9,7 @@ use std::fmt::Debug;
 use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;
+use utils::lsn::AtomicLsn;
 
 use std::sync::atomic::AtomicU32;
 use utils::lsn::Lsn;
@@ -58,7 +57,12 @@ pub(crate) struct UploadQueueInitialized {
     /// uploaded. `Lsn(0)` if nothing was uploaded yet.
     /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
     /// Safekeeper can rely on it to make decisions for WAL storage.
-    pub(crate) last_uploaded_consistent_lsn: Lsn,
+    ///
+    /// visible_remote_consistent_lsn is only updated after our generation has been validated with
+    /// the control plane (unless a timeline's generation is None, in which case
+    /// we skip validation)
+    pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
+    pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
 
     // Breakdown of different kinds of tasks currently in-progress
     pub(crate) num_inprogress_layer_uploads: usize,
@@ -81,6 +85,14 @@ impl UploadQueueInitialized {
     pub(super) fn no_pending_work(&self) -> bool {
         self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
     }
+
+    pub(super) fn get_last_remote_consistent_lsn_visible(&self) -> Lsn {
+        self.visible_remote_consistent_lsn.load()
+    }
+
+    pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
+        self.projected_remote_consistent_lsn
+    }
 }
 
 #[derive(Clone, Copy)]
@@ -114,9 +126,8 @@ impl UploadQueue {
             latest_files: HashMap::new(),
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: metadata.clone(),
-            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
-            // safekeepers from garbage-collecting anything.
-            last_uploaded_consistent_lsn: Lsn(0),
+            projected_remote_consistent_lsn: None,
+            visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
             // what follows are boring default initializations
             task_counter: 0,
             num_inprogress_layer_uploads: 0,
@@ -158,7 +169,10 @@ impl UploadQueue {
             latest_files: files,
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: index_part.metadata.clone(),
-            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
+            projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
+            visible_remote_consistent_lsn: Arc::new(
+                index_part.metadata.disk_consistent_lsn().into(),
+            ),
             // what follows are boring default initializations
             task_counter: 0,
             num_inprogress_layer_uploads: 0,
@@ -201,12 +215,11 @@ pub(crate) struct UploadTask {
     pub(crate) op: UploadOp,
 }
 
+/// A deletion of some layers within the lifetime of a timeline.  This is not used
+/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug)]
 pub(crate) struct Delete {
-    pub(crate) file_kind: RemoteOpFileKind,
-    pub(crate) layer_file_name: LayerFileName,
-    pub(crate) scheduled_from_timeline_delete: bool,
-    pub(crate) generation: Generation,
+    pub(crate) layers: Vec<(LayerFileName, Generation)>,
 }
 
 #[derive(Debug)]
@@ -217,7 +230,7 @@ pub(crate) enum UploadOp {
     /// Upload the metadata file
     UploadMetadata(IndexPart, Lsn),
 
-    /// Delete a layer file
+    /// Delete layer files
     Delete(Delete),
 
     /// Barrier. When the barrier operation is reached,
@@ -239,13 +252,9 @@ impl std::fmt::Display for UploadOp {
             UploadOp::UploadMetadata(_, lsn) => {
                 write!(f, "UploadMetadata(lsn: {})", lsn)
             }
-            UploadOp::Delete(delete) => write!(
-                f,
-                "Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
-                delete.layer_file_name.file_name(),
-                delete.scheduled_from_timeline_delete,
-                delete.generation
-            ),
+            UploadOp::Delete(delete) => {
+                write!(f, "Delete({} layers)", delete.layers.len(),)
+            }
             UploadOp::Barrier(_) => write!(f, "Barrier"),
         }
     }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0667403ba3..38d0aeb960 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1481,6 +1481,16 @@ class NeonAttachmentService:
             self.running = False
         return self
 
+    def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
+        response = requests.post(
+            f"{self.env.control_plane_api}/attach_hook",
+            json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
+        )
+        response.raise_for_status()
+        gen = response.json()["gen"]
+        assert isinstance(gen, int)
+        return gen
+
     def __enter__(self) -> "NeonAttachmentService":
         return self
 
@@ -1689,12 +1699,7 @@ class NeonPageserver(PgProtocol):
         to call into the pageserver HTTP client.
         """
         if self.env.attachment_service is not None:
-            response = requests.post(
-                f"{self.env.control_plane_api}/attach_hook",
-                json={"tenant_id": str(tenant_id), "pageserver_id": self.id},
-            )
-            response.raise_for_status()
-            generation = response.json()["gen"]
+            generation = self.env.attachment_service.attach_hook(tenant_id, self.id)
         else:
             generation = None
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 9373073abf..9fdcd22bc2 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -620,3 +620,8 @@ class PageserverHttpClient(requests.Session):
             },
         )
         self.verbose_error(res)
+
+    def deletion_queue_flush(self, execute: bool = False):
+        self.put(
+            f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
+        ).raise_for_status()
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 2e5d75a0fc..70c2a06a07 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -267,7 +267,7 @@ def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional
 
 
 def list_prefix(
-    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
+    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
 ) -> ListObjectsV2OutputTypeDef:
     """
     Note that this function takes into account prefix_in_bucket.
@@ -287,7 +287,7 @@ def list_prefix(
 
     # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
     response = remote.client.list_objects_v2(
-        Delimiter="/",
+        Delimiter=delimiter,
         Bucket=remote.bucket_name,
         Prefix=prefix,
     )
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
new file mode 100644
index 0000000000..81d38ac934
--- /dev/null
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -0,0 +1,352 @@
+"""
+
+Tests in this module exercise the pageserver's behavior around generation numbers,
+as defined in docs/rfcs/025-generation-numbers.md.  Briefly, the behaviors we require
+of the pageserver are:
+- Do not start a tenant without a generation number if control_plane_api is set
+- Remote objects must be suffixed with generation
+- Deletions may only be executed after validating generation
+- Updates to remote_consistent_lsn may only be made visible after validating generation
+"""
+
+
+import re
+import time
+from typing import Optional
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import list_prefix
+from fixtures.remote_storage import (
+    RemoteStorageKind,
+)
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import print_gc_result, wait_until
+
+# A tenant configuration that is convenient for generating uploads and deletions
+# without a large amount of postgres traffic.
+TENANT_CONF = {
+    # small checkpointing and compaction targets to ensure we generate many upload operations
+    "checkpoint_distance": f"{128 * 1024}",
+    "compaction_threshold": "1",
+    "compaction_target_size": f"{128 * 1024}",
+    # no PITR horizon, we specify the horizon when we request on-demand GC
+    "pitr_interval": "0s",
+    # disable background compaction and GC. We invoke it manually when we want it to happen.
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    # create image layers eagerly, so that GC can remove some layers
+    "image_creation_threshold": "1",
+}
+
+
+def generate_uploads_and_deletions(
+    env: NeonEnv,
+    *,
+    init: bool = True,
+    tenant_id: Optional[TenantId] = None,
+    timeline_id: Optional[TimelineId] = None,
+    data: Optional[str] = None,
+):
+    """
+    Using the environment's default tenant + timeline, generate a load pattern
+    that results in some uploads and some deletions to remote storage.
+    """
+
+    if tenant_id is None:
+        tenant_id = env.initial_tenant
+    assert tenant_id is not None
+
+    if timeline_id is None:
+        timeline_id = env.initial_timeline
+    assert timeline_id is not None
+
+    ps_http = env.pageserver.http_client()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        if init:
+            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+        def churn(data):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                INSERT INTO foo (id, val)
+                SELECT g, '{data}'
+                FROM generate_series(1, 20000) g
+                ON CONFLICT (id) DO UPDATE
+                SET val = EXCLUDED.val
+                """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
+            assert tenant_id is not None
+            assert timeline_id is not None
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+            ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        # Compaction should generate some GC-eligible layers
+        for i in range(0, 2):
+            churn(f"{i if data is None else data}")
+
+        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+
+def get_metric_or_0(ps_http, metric: str) -> int:
+    v = ps_http.get_metric_value(metric)
+    return 0 if v is None else int(v)
+
+
+def get_deletion_queue_executed(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_executed_total")
+
+
+def get_deletion_queue_submitted(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
+
+
+def get_deletion_queue_dropped(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
+
+
+def get_deletion_queue_unexpected_errors(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_unexpected_errors_total")
+
+
+def get_deletion_queue_dropped_lsn_updates(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_lsn_updates_total")
+
+
+def get_deletion_queue_depth(ps_http) -> int:
+    """
+    Number of deletions submitted but not yet executed or dropped (missing metrics count as 0)
+    """
+    submitted = get_deletion_queue_submitted(ps_http)
+    executed = get_deletion_queue_executed(ps_http)
+    dropped = get_deletion_queue_dropped(ps_http)
+    depth = submitted - executed - dropped
+    log.info(f"get_deletion_queue_depth: {depth} ({submitted} - {executed} - {dropped})")
+
+    assert depth >= 0
+    return int(depth)
+
+
+def assert_deletion_queue(ps_http, size_fn) -> None:
+    v = get_deletion_queue_depth(ps_http)
+    assert v is not None
+    assert size_fn(v) is True
+
+
+def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
+    """
+    Validate behavior when a pageserver is run without generation support enabled,
+    then started again after activating it:
+    - Before upgrade, no objects should have generation suffixes
+    - After upgrade, the bucket should contain a mixture.
+    - In both cases, postgres I/O should work.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+
+    env = neon_env_builder.init_configs()
+    env.broker.try_start()
+    for sk in env.safekeepers:
+        sk.start()
+    assert env.attachment_service is not None
+    env.attachment_service.start()
+
+    env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
+
+    env.neon_cli.create_tenant(
+        tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
+    )
+    generate_uploads_and_deletions(env)
+
+    def parse_generation_suffix(key):
+        m = re.match(".+-([0-9a-zA-Z]{8})$", key)
+        if m is None:
+            return None
+        else:
+            log.info(f"match: {m}")
+            log.info(f"group: {m.group(1)}")
+            return int(m.group(1), 16)
+
+    pre_upgrade_keys = list(
+        [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
+    )
+    for key in pre_upgrade_keys:
+        assert parse_generation_suffix(key) is None
+
+    env.pageserver.stop()
+
+    # Starting without the override that disabled control_plane_api
+    env.pageserver.start()
+
+    generate_uploads_and_deletions(env, init=False)
+
+    legacy_objects: list[str] = []
+    suffixed_objects = []
+    post_upgrade_keys = list(
+        [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]]
+    )
+    for key in post_upgrade_keys:
+        log.info(f"post-upgrade key: {key}")
+        if parse_generation_suffix(key) is not None:
+            suffixed_objects.append(key)
+        else:
+            legacy_objects.append(key)
+
+    # Bucket now contains a mixture of suffixed and non-suffixed objects
+    assert len(suffixed_objects) > 0
+    assert len(legacy_objects) > 0
+
+    assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
+
+
+def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    some_other_pageserver = 1234
+    ps_http = env.pageserver.http_client()
+
+    generate_uploads_and_deletions(env)
+
+    # Flush: pending deletions should all complete
+    assert_deletion_queue(ps_http, lambda n: n > 0)
+    ps_http.deletion_queue_flush(execute=True)
+    assert_deletion_queue(ps_http, lambda n: n == 0)
+    assert get_deletion_queue_dropped(ps_http) == 0
+
+    # Our visible remote_consistent_lsn should match projected
+    timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+    assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+    env.pageserver.allowed_errors.extend(
+        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+    )
+
+    # Now advance the generation in the control plane: subsequent validations
+    # from the running pageserver will fail.  No more deletions should happen.
+    env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
+    generate_uploads_and_deletions(env, init=False)
+
+    assert_deletion_queue(ps_http, lambda n: n > 0)
+    queue_depth_before = get_deletion_queue_depth(ps_http)
+    executed_before = get_deletion_queue_executed(ps_http)
+    ps_http.deletion_queue_flush(execute=True)
+
+    # Queue drains to zero because we dropped deletions
+    assert_deletion_queue(ps_http, lambda n: n == 0)
+    # The executed counter has not incremented
+    assert get_deletion_queue_executed(ps_http) == executed_before
+    # The dropped counter has incremented to consume all of the deletions that were previously enqueued
+    assert get_deletion_queue_dropped(ps_http) == queue_depth_before
+
+    # Flush to S3 and see that remote_consistent_lsn does not advance: it cannot
+    # because generation validation fails.
+    timeline = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+    assert timeline["remote_consistent_lsn"] != timeline["remote_consistent_lsn_visible"]
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) > 0
+
+    # TODO: list bucket and confirm all objects have a generation suffix.
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+
+
+@pytest.mark.parametrize("keep_attachment", [True, False])
+def test_deletion_queue_recovery(
+    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
+):
+    """
+    :param keep_attachment: If true, we re-attach after restart.  Else, we act as if some other
+    node took the attachment while we were restarting.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    ps_http = env.pageserver.http_client()
+
+    # Prevent deletion lists from being executed, to build up some backlog of deletions
+    ps_http.configure_failpoints(
+        [
+            ("deletion-queue-before-execute", "return"),
+        ]
+    )
+
+    generate_uploads_and_deletions(env)
+
+    # There should be entries in the deletion queue
+    assert_deletion_queue(ps_http, lambda n: n > 0)
+    ps_http.deletion_queue_flush()
+    before_restart_depth = get_deletion_queue_depth(ps_http)
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
+    env.pageserver.stop(immediate=True)
+
+    if not keep_attachment:
+        some_other_pageserver = 101010
+        assert env.attachment_service is not None
+        env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
+
+    env.pageserver.start()
+
+    def assert_deletions_submitted(n: int):
+        assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
+
+    # After restart, issue a flush to kick the deletion frontend to do recovery.
+    # It should recover all the operations we submitted before the restart.
+    ps_http.deletion_queue_flush(execute=False)
+    wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
+
+    # The queue should drain through completely if we flush it
+    ps_http.deletion_queue_flush(execute=True)
+    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
+
+    if keep_attachment:
+        # If we kept the attachment, then our pre-restart deletions should have executed
+        # successfully
+        assert get_deletion_queue_executed(ps_http) == before_restart_depth
+    else:
+        # If we lost the attachment, we should have dropped our pre-restart deletions.
+        assert get_deletion_queue_dropped(ps_http) == before_restart_depth
+        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+    # Restart again
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()
+
+    # No deletion lists should be recovered: this demonstrates that deletion lists
+    # were cleaned up after being executed or dropped in the previous process lifetime.
+    time.sleep(1)
+    assert_deletion_queue(ps_http, lambda n: n == 0)
+
+    assert get_deletion_queue_unexpected_errors(ps_http) == 0
+    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index c6ddb54ee6..9d0d42a4ef 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -43,6 +43,12 @@ def test_tenant_delete_smoke(
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(
+        [
+            # The deletion queue will complain when it encounters simulated S3 errors
+            ".*deletion executor: DeleteObjects request failed.*",
+        ]
+    )
 
     # lucky race with stopping from flushing a layer we fail to schedule any uploads
     env.pageserver.allowed_errors.append(
@@ -195,6 +201,14 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
         ]
     )
 
+    if simulate_failures:
+        env.pageserver.allowed_errors.extend(
+            [
+                # The deletion queue will complain when it encounters simulated S3 errors
+                ".*deletion executor: DeleteObjects request failed.*",
+            ]
+        )
+
     ps_http = env.pageserver.http_client()
 
     timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
@@ -383,6 +397,7 @@ def test_tenant_delete_is_resumed_on_attach(
     assert not tenant_path.exists()
 
     if remote_storage_kind in available_s3_storages():
+        ps_http.deletion_queue_flush(execute=True)
         assert_prefix_empty(
             neon_env_builder,
             prefix="/".join(
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 0e4df21d83..839df69240 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -807,6 +807,8 @@ def test_delete_orphaned_objects(
     reason = timeline_info["state"]["Broken"]["reason"]
     assert reason.endswith(f"failpoint: {failpoint}"), reason
 
+    ps_http.deletion_queue_flush(execute=True)
+
     for orphan in orphans:
         assert not orphan.exists()
         assert env.pageserver.log_contains(

From 6cc8c31fd844ce60dbd3ba8d3ccff96aea7bd82d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 27 Sep 2023 12:00:21 +0300
Subject: [PATCH 03/24] disk_usage_based_eviction: switch warmup to use full
 table scans (#5384)

Fixes #3978. `test_partial_evict_tenant` can fail multiple times so even
though we retry it as flaky, it will still haunt us.

Originally I was going to just relax the comparison, but ended up
replacing the warm-up with full table scans instead of `pgbench
--select-only`. This seems to help by producing the expected layer
accesses. There might be something off with how many layers pg16
produces compared to pg14 and pg15; created #5392 for that.
---
 .../regress/test_disk_usage_eviction.py       | 104 +++++++++++++-----
 1 file changed, 76 insertions(+), 28 deletions(-)

diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index ae62fdf4a4..f3f3a1ddf3 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -74,11 +74,13 @@ class EvictionEnv:
     pgbench_init_lsns: Dict[TenantId, Lsn]
 
     def timelines_du(self) -> Tuple[int, int, int]:
-        return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines])
+        return poor_mans_du(
+            self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
+        )
 
     def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
         return {
-            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0]
+            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
             for tid, tlid in self.timelines
         }
 
@@ -89,7 +91,21 @@ class EvictionEnv:
         """
         lsn = self.pgbench_init_lsns[tenant_id]
         with self.neon_env.endpoints.create_start("main", tenant_id=tenant_id, lsn=lsn) as endpoint:
-            self.pg_bin.run(["pgbench", "-S", endpoint.connstr()])
+            # instead of using pgbench --select-only which does point selects,
+            # run full table scans for all tables
+            with endpoint.connect() as conn:
+                cur = conn.cursor()
+
+                tables_cols = {
+                    "pgbench_accounts": "abalance",
+                    "pgbench_tellers": "tbalance",
+                    "pgbench_branches": "bbalance",
+                    "pgbench_history": "delta",
+                }
+
+                for table, column in tables_cols.items():
+                    cur.execute(f"select avg({column}) from {table}")
+                    _avg = cur.fetchone()
 
     def pageserver_start_with_disk_usage_eviction(
         self, period, max_usage_pct, min_avail_bytes, mock_behavior
@@ -127,6 +143,19 @@ class EvictionEnv:
         self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
 
 
+def human_bytes(amt: float) -> str:
+    suffixes = ["", "Ki", "Mi", "Gi"]
+
+    last = suffixes[-1]
+
+    for name in suffixes:
+        if amt < 1024 or name == last:
+            return f"{int(round(amt))} {name}B"
+        amt = amt / 1024
+
+    raise RuntimeError("unreachable")
+
+
 @pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
     """
@@ -215,8 +244,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
 
     healthy_tenant_id, healthy_timeline_id = env.timelines[1]
 
-    broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
-    healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+    broken_size_pre, _, _ = poor_mans_du(
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+    )
+    healthy_size_pre, _, _ = poor_mans_du(
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+    )
 
     # try to evict everything, then validate that broken tenant wasn't touched
     target = broken_size_pre + healthy_size_pre
@@ -224,8 +257,12 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
     response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
     log.info(f"{response}")
 
-    broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)])
-    healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)])
+    broken_size_post, _, _ = poor_mans_du(
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+    )
+    healthy_size_post, _, _ = poor_mans_du(
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+    )
 
     assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
     assert healthy_size_post < healthy_size_pre
@@ -366,18 +403,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
     du_by_timeline = env.du_by_timeline()
 
     # pick any tenant
-    [our_tenant, other_tenant] = list(du_by_timeline.keys())
-    (tenant_id, timeline_id) = our_tenant
+    [warm, cold] = list(du_by_timeline.keys())
+    (tenant_id, timeline_id) = warm
 
-    # make our tenant more recently used than the other one
+    # make picked tenant more recently used than the other one
     env.warm_up_tenant(tenant_id)
 
     # Build up enough pressure to require evictions from both tenants,
     # but not enough to fall into global LRU.
-    # So, set target to all occipied space, except 2*env.layer_size per tenant
-    target = (
-        du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size
-    )
+    # So, set target to all occupied space, except 2*env.layer_size per tenant
+    target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
     response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
     log.info(f"{response}")
 
@@ -392,22 +427,33 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
             later_tenant_usage < du_by_timeline[tenant]
         ), "all tenants should have lost some layers"
 
+    warm_size = later_du_by_timeline[warm]
+
+    # bounds for warm_size
+    warm_lower = 0.5 * du_by_timeline[warm]
+
+    # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+    # So, check for up to 3 here.
+    warm_upper = warm_lower + 3 * env.layer_size
+
+    cold_size = later_du_by_timeline[cold]
+    cold_upper = 2 * env.layer_size
+
+    log.info(
+        f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
+    )
+    log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+
+    assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
+    assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+
     assert (
-        later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant]
-    ), "our warmed up tenant should be at about half capacity, part 1"
-    assert (
-        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
-        # So, check for up to 3 here.
-        later_du_by_timeline[our_tenant]
-        < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size
-    ), "our warmed up tenant should be at about half capacity, part 2"
-    assert (
-        later_du_by_timeline[other_tenant] < 2 * env.layer_size
-    ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size"
+        cold_size < cold_upper
+    ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
 
 
 def poor_mans_du(
-    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
+    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
 ) -> Tuple[int, int, int]:
     """
     Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -430,9 +476,11 @@ def poor_mans_du(
                 smallest_layer = min(smallest_layer, size)
             else:
                 smallest_layer = size
-            log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
+            if verbose:
+                log.info(f"{tenant_id}/{timeline_id} => {file.name} {size} ({human_bytes(size)})")
 
-        log.info(f"{tenant_id}/{timeline_id}: sum {total}")
+        if verbose:
+            log.info(f"{tenant_id}/{timeline_id}: sum {total} ({human_bytes(total)})")
         total_on_disk += total
 
     assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0

From ce45fd4cc72b5873179223ccbfc2b2be8ad2ca14 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 27 Sep 2023 14:00:49 +0300
Subject: [PATCH 04/24] test_pageserver_metric_collection: allowed synthetic
 size to be cancelled at shutdown (#5398)

There is [evidence] of these messages being logged during shutdown. They
can happen if we are unlucky enough.

[evidence]:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/6323709725/index.html#suites/e557ea0d920cfebd45c1921296031273/4120269a64eed172
---
 .../test_pageserver_metric_collection.py      | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index dae39d2752..74e016a9df 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -18,6 +18,9 @@ from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
 
+# TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
+
+
 @pytest.mark.parametrize(
     "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
 )
@@ -68,6 +71,14 @@ def test_metric_collection(
     env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     # httpserver is shut down before pageserver during passing run
     env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
+    # we have a fast rate of calculation, these can happen at shutdown
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+    )
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -211,6 +222,14 @@ def test_metric_collection_cleans_up_tempfile(
 
     # httpserver is shut down before pageserver during passing run
     env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
+    # we have a fast rate of calculation, these can happen at shutdown
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
+    )
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)

From 7038ce40ce8c251d4d50b81fe54132bce708a586 Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Wed, 27 Sep 2023 13:48:30 +0200
Subject: [PATCH 05/24] Fix neon_zeroextend's WAL logging (#5387)

When you log more than a few blocks, you need to reserve the space in
advance. We didn't do that, so we got errors. Now we do that, and
shouldn't get errors.
---
 pgxn/neon/pagestore_smgr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 919bca03e9..2e4364cbfa 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1790,6 +1790,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if (!XLogInsertAllowed())
 		return;
 
+	/* ensure we have enough xlog buffers to log max-sized records */
+	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
+
+	/*
+	 * Iterate over all the pages. They are collected into batches of
+	 * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each
+	 * batch.
+	 */
 	while (remblocks > 0)
 	{
 		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);

From 2cced770da14ecba956968fcdf1f0d1a080d5b84 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 27 Sep 2023 13:12:13 +0100
Subject: [PATCH 06/24] pageserver: add control_plane_api_token config (#5383)

## Problem

Control plane API calls in prod will need authentication.

## Summary of changes

`control_plane_api_token` config is loaded and set as HTTP
`Authorization` header.

Closes: https://github.com/neondatabase/neon/issues/5139
---
 libs/utils/src/logging.rs              | 18 ++++++++++++++++++
 pageserver/src/config.rs               | 16 ++++++++++++++--
 pageserver/src/control_plane_client.rs | 12 ++++++++----
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index f69c0603d5..7f17970c4c 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -216,6 +216,24 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
     }
 }
 
+/// When you need to store a secret but want to make sure it won't
+/// be accidentally logged, wrap it in a SecretString, whose Debug
+/// implementation does not expose the contents.
+#[derive(Clone, Eq, PartialEq)]
+pub struct SecretString(String);
+
+impl SecretString {
+    pub fn get_contents(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl std::fmt::Debug for SecretString {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[SECRET]")
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use metrics::{core::Opts, IntCounterVec};
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index ed767b764e..c3f2f14a74 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,6 +11,7 @@ use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
+use utils::logging::SecretString;
 
 use once_cell::sync::OnceCell;
 use reqwest::Url;
@@ -207,6 +208,9 @@ pub struct PageServerConf {
     pub background_task_maximum_delay: Duration,
 
     pub control_plane_api: Option<Url>,
+
+    /// JWT token for use with the control plane API.
+    pub control_plane_api_token: Option<SecretString>,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -283,6 +287,7 @@ struct PageServerConfigBuilder {
     background_task_maximum_delay: BuilderValue<Duration>,
 
     control_plane_api: BuilderValue<Option<Url>>,
+    control_plane_api_token: BuilderValue<Option<SecretString>>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -347,6 +352,7 @@ impl Default for PageServerConfigBuilder {
             .unwrap()),
 
             control_plane_api: Set(None),
+            control_plane_api_token: Set(None),
         }
     }
 }
@@ -567,6 +573,9 @@ impl PageServerConfigBuilder {
             control_plane_api: self
                 .control_plane_api
                 .ok_or(anyhow!("missing control_plane_api"))?,
+            control_plane_api_token: self
+                .control_plane_api_token
+                .ok_or(anyhow!("missing control_plane_api_token"))?,
         })
     }
 }
@@ -945,6 +954,7 @@ impl PageServerConf {
             ondemand_download_behavior_treat_error_as_warn: false,
             background_task_maximum_delay: Duration::ZERO,
             control_plane_api: None,
+            control_plane_api_token: None,
         }
     }
 }
@@ -1168,7 +1178,8 @@ background_task_maximum_delay = '334 s'
                 background_task_maximum_delay: humantime::parse_duration(
                     defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                 )?,
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1224,7 +1235,8 @@ background_task_maximum_delay = '334 s'
                 test_remote_failures: 0,
                 ondemand_download_behavior_treat_error_as_warn: false,
                 background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 555f76e523..3375392373 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -53,12 +53,16 @@ impl ControlPlaneClient {
             segs.pop_if_empty().push("");
         }
 
-        let client = reqwest::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");
+        let mut client = reqwest::ClientBuilder::new();
+
+        if let Some(jwt) = &conf.control_plane_api_token {
+            let mut headers = hyper::HeaderMap::new();
+            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            client = client.default_headers(headers);
+        }
 
         Some(Self {
-            http_client: client,
+            http_client: client.build().expect("Failed to construct HTTP client"),
             base_url: url,
             node_id: conf.id,
             cancel: cancel.clone(),

From 090a6443929915ed92924b6ae2077d0b229f118d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 27 Sep 2023 14:18:05 +0200
Subject: [PATCH 07/24] metrics for resident & remote physical size without
 tenant/timeline dimension (#5389)

So that we can compute worst-case /storage size dashboard panel more
cheaply.
---
 pageserver/src/metrics.rs                     | 97 +++++++++++++++++--
 .../src/tenant/remote_timeline_client.rs      |  4 +-
 pageserver/src/tenant/timeline.rs             | 13 +--
 .../src/tenant/timeline/layer_manager.rs      |  2 +-
 4 files changed, 94 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index b085176f18..de94eb8152 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -291,6 +291,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_resident_physical_size_global",
+        "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_remote_physical_size",
@@ -301,6 +309,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_remote_physical_size_global",
+        "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "pageserver_remote_ondemand_downloaded_layers_total",
@@ -1209,7 +1225,7 @@ pub struct TimelineMetrics {
     pub load_layer_map_histo: StorageTimeMetrics,
     pub garbage_collect_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
-    pub resident_physical_size_gauge: UIntGauge,
+    resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub num_persistent_files_created: IntCounter,
@@ -1287,10 +1303,29 @@ impl TimelineMetrics {
     }
 
     pub fn record_new_file_metrics(&self, sz: u64) {
-        self.resident_physical_size_gauge.add(sz);
+        self.resident_physical_size_add(sz);
         self.num_persistent_files_created.inc_by(1);
         self.persistent_bytes_written.inc_by(sz);
     }
+
+    pub fn resident_physical_size_sub(&self, sz: u64) {
+        self.resident_physical_size_gauge.sub(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
+    }
+
+    pub fn resident_physical_size_add(&self, sz: u64) {
+        self.resident_physical_size_gauge.add(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
+    }
+
+    pub fn resident_physical_size_set(&self, sz: u64) {
+        self.resident_physical_size_gauge.set(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
+    }
+
+    pub fn resident_physical_size_get(&self) -> u64 {
+        self.resident_physical_size_gauge.get()
+    }
 }
 
 impl Drop for TimelineMetrics {
@@ -1298,7 +1333,10 @@ impl Drop for TimelineMetrics {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        {
+            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        }
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
         let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1352,10 +1390,43 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 
+/// Maintain a per timeline gauge in addition to the global gauge.
+struct PerTimelineRemotePhysicalSizeGauge {
+    last_set: u64,
+    gauge: UIntGauge,
+}
+
+impl PerTimelineRemotePhysicalSizeGauge {
+    fn new(per_timeline_gauge: UIntGauge) -> Self {
+        Self {
+            last_set: per_timeline_gauge.get(),
+            gauge: per_timeline_gauge,
+        }
+    }
+    fn set(&mut self, sz: u64) {
+        self.gauge.set(sz);
+        if sz < self.last_set {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
+        } else {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
+        };
+        self.last_set = sz;
+    }
+    fn get(&self) -> u64 {
+        self.gauge.get()
+    }
+}
+
+impl Drop for PerTimelineRemotePhysicalSizeGauge {
+    fn drop(&mut self) {
+        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
+    }
+}
+
 pub struct RemoteTimelineClientMetrics {
     tenant_id: String,
     timeline_id: String,
-    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
     calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
     bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
     bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -1373,18 +1444,24 @@ impl RemoteTimelineClientMetrics {
         }
     }
 
-    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
+    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
         let mut guard = self.remote_physical_size_gauge.lock().unwrap();
-        guard
-            .get_or_insert_with(|| {
+        let gauge = guard.get_or_insert_with(|| {
+            PerTimelineRemotePhysicalSizeGauge::new(
                 REMOTE_PHYSICAL_SIZE
                     .get_metric_with_label_values(&[
                         &self.tenant_id.to_string(),
                         &self.timeline_id.to_string(),
                     ])
-                    .unwrap()
-            })
-            .clone()
+                    .unwrap(),
+            )
+        });
+        gauge.set(sz);
+    }
+
+    pub(crate) fn remote_physical_size_get(&self) -> u64 {
+        let guard = self.remote_physical_size_gauge.lock().unwrap();
+        guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
     }
 
     pub fn remote_operation_time(
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 4e495d9bb2..ee99151ef2 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -453,11 +453,11 @@ impl RemoteTimelineClient {
         } else {
             0
         };
-        self.metrics.remote_physical_size_gauge().set(size);
+        self.metrics.remote_physical_size_set(size);
     }
 
     pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_gauge().get()
+        self.metrics.remote_physical_size_get()
     }
 
     //
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4fa5039d79..9b62ba1c50 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -559,7 +559,7 @@ impl Timeline {
     }
 
     pub fn resident_physical_size(&self) -> u64 {
-        self.metrics.resident_physical_size_gauge.get()
+        self.metrics.resident_physical_size_get()
     }
 
     ///
@@ -1309,10 +1309,7 @@ impl Timeline {
         // will treat the file as a local layer again, count it towards resident size,
         // and it'll be like the layer removal never happened.
         // The bump in resident size is perhaps unexpected but overall a robust behavior.
-        self.metrics
-            .resident_physical_size_gauge
-            .sub(layer_file_size);
-
+        self.metrics.resident_physical_size_sub(layer_file_size);
         self.metrics.evictions.inc();
 
         if let Some(delta) = local_layer_residence_duration {
@@ -1846,9 +1843,7 @@ impl Timeline {
             "loaded layer map with {} layers at {}, total physical size: {}",
             num_layers, disk_consistent_lsn, total_physical_size
         );
-        self.metrics
-            .resident_physical_size_gauge
-            .set(total_physical_size);
+        self.metrics.resident_physical_size_set(total_physical_size);
 
         timer.stop_and_record();
         Ok(())
@@ -4398,7 +4393,7 @@ impl Timeline {
 
                     // XXX the temp file is still around in Err() case
                     // and consumes space until we clean up upon pageserver restart.
-                    self_clone.metrics.resident_physical_size_gauge.add(*size);
+                    self_clone.metrics.resident_physical_size_add(*size);
 
                     // Download complete. Replace the RemoteLayer with the corresponding
                     // Delta- or ImageLayer in the layer map.
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 3c88d31f24..0a387bd779 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -263,7 +263,7 @@ impl LayerManager {
         let desc = layer.layer_desc();
         if !layer.is_remote_layer() {
             layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_gauge.sub(desc.file_size);
+            metrics.resident_physical_size_sub(desc.file_size);
         }
 
         // TODO Removing from the bottom of the layer map is expensive.

From 48e85460fc9b1d0ea14d767778a9f12059ff8ea4 Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Wed, 27 Sep 2023 21:27:23 -0700
Subject: [PATCH 08/24] vm-monitor: Unset memory.high on start + refactor
 cgroup handling (#5348)

## Problem

Over the past couple days, we've had a couple VMs hit issues with
postgres getting hit by memory.high throttling, even after #5303 was
supposed to fix that. The tl;dr of those issues is that because
vm-monitor startup sets the file cache size first, before interacting
with the cgroup, cgroup throttling can mean we timeout connecting to the
file cache and never reset the cgroup, even if memory has been upscaled
since then.

See e.g.:

- https://neondb.slack.com/archives/C03F5SM1N02/p1695218132208249
- https://neondb.slack.com/archives/C03F5SM1N02/p1695314613696659

## Summary of changes

This PR adds an additional step into vm-monitor startup, where we first
set the cgroup's memory.high value to 'max', removing the capacity for
throttling. This is preferable to just setting memory.high before the file
cache, because it's theoretically possible that the new value of
memory.high could still be less than the current memory usage, in which
case postgres could continue to be throttled without sufficient memory
events to relieve that.

Implementing this properly involved adding a method to our internal
cgroup interface, and it seemed like there was duplicated functionality
there, so this PR unifies that as well, making things a bit more
consistent.
---
 libs/vm_monitor/src/cgroup.rs | 59 ++++++---------------
 libs/vm_monitor/src/runner.rs | 99 +++++++++++++++++++++--------------
 2 files changed, 74 insertions(+), 84 deletions(-)

diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs
index 3254fa4501..15e972505e 100644
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -431,14 +431,14 @@ impl CgroupWatcher {
                             .context("failed to request upscale")?;
 
                         let memory_high =
-                            self.get_high_bytes().context("failed to get memory.high")?;
+                            self.get_memory_high_bytes().context("failed to get memory.high")?;
                         let new_high = memory_high + self.config.memory_high_increase_by_bytes;
                         info!(
                             current_high_bytes = memory_high,
                             new_high_bytes = new_high,
                             "updating memory.high"
                         );
-                        self.set_high_bytes(new_high)
+                        self.set_memory_high_bytes(new_high)
                             .context("failed to set memory.high")?;
                         last_memory_high_increase_at = Some(Instant::now());
                         continue;
@@ -556,14 +556,6 @@ impl CgroupWatcher {
     }
 }
 
-/// Represents a set of limits we apply to a cgroup to control memory usage.
-///
-/// Setting these values also affects the thresholds for receiving usage alerts.
-#[derive(Debug)]
-pub struct MemoryLimits {
-    pub high: u64,
-}
-
 // Methods for manipulating the actual cgroup
 impl CgroupWatcher {
     /// Get a handle on the freezer subsystem.
@@ -624,50 +616,29 @@ impl CgroupWatcher {
     }
 
     /// Set cgroup memory.high threshold.
-    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
+    }
+
+    /// Set the cgroup's memory.high to 'max', disabling it.
+    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Max)
+    }
+
+    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
         self.memory()
             .context("failed to get memory subsystem")?
             .set_mem(cgroups_rs::memory::SetMemory {
                 low: None,
-                high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
+                high: Some(value),
                 min: None,
                 max: None,
             })
-            .context("failed to set memory.high")
-    }
-
-    /// Set cgroup memory.high and memory.max.
-    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
-        info!(limits.high, path = self.path(), "writing new memory limits",);
-        self.memory()
-            .context("failed to get memory subsystem while setting memory limits")?
-            .set_mem(cgroups_rs::memory::SetMemory {
-                min: None,
-                low: None,
-                high: Some(MaxValue::Value(
-                    u64::min(limits.high, i64::MAX as u64) as i64
-                )),
-                max: None,
-            })
-            .context("failed to set memory limits")
-    }
-
-    /// Given some amount of available memory, set the desired cgroup memory limits
-    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
-        let new_high = self.config.calculate_memory_high_value(available_memory);
-        let limits = MemoryLimits { high: new_high };
-        info!(
-            path = self.path(),
-            memory = ?limits,
-            "setting cgroup memory",
-        );
-        self.set_limits(&limits)
-            .context("failed to set cgroup memory limits")?;
-        Ok(())
+            .map_err(anyhow::Error::from)
     }
 
     /// Get memory.high threshold.
-    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
+    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
         let high = self
             .memory()
             .context("failed to get memory subsystem while getting memory statistics")?
diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs
index 376017d784..09863c8936 100644
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -16,7 +16,7 @@ use tokio::sync::mpsc;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 
-use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
+use crate::cgroup::{CgroupWatcher, Sequenced};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -106,6 +106,51 @@ impl Runner {
             kill,
         };
 
+        // If we have both the cgroup and file cache integrations enabled, it's possible for
+        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
+        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
+        // we *do* still want to determine the file cache size before setting the cgroup's
+        // memory.high, so it's not as simple as just swapping the order.
+        //
+        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
+        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
+        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
+        // of a hacky solution, but helps with reliability.
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");
+
+            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
+                .context("failed to create cgroup manager")?;
+
+            info!("temporarily unsetting memory.high");
+
+            // Temporarily un-set cgroup memory.high; see above.
+            cgroup
+                .unset_memory_high()
+                .context("failed to unset memory.high")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            let cgroup_clone = Arc::clone(&cgroup);
+            spawn_with_cancel(
+                token.clone(),
+                |_| error!("cgroup watcher terminated"),
+                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
+            );
+
+            state.cgroup = Some(cgroup);
+        } else {
+            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
+            // This allows us to poll it in `Monitor::run` regardless of whether we
+            // are managing a cgroup or not. If we don't forget it, all receives will
+            // immediately return an error because the sender is droped and it will
+            // claim all select! statements, effectively turning `Monitor::run` into
+            // `loop { fail to receive }`.
+            mem::forget(requesting_send);
+        }
+
         let mut file_cache_reserved_bytes = 0;
         let mem = get_total_system_memory();
 
@@ -119,7 +164,7 @@ impl Runner {
                 false => FileCacheConfig::default_in_memory(),
             };
 
-            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+            let mut file_cache = FileCacheState::new(connstr, config, token)
                 .await
                 .context("failed to create file cache")?;
 
@@ -152,35 +197,15 @@ impl Runner {
             state.filecache = Some(file_cache);
         }
 
-        if let Some(name) = &args.cgroup {
-            let (mut cgroup, cgroup_event_stream) =
-                CgroupWatcher::new(name.clone(), requesting_send)
-                    .context("failed to create cgroup manager")?;
-
+        if let Some(cgroup) = &state.cgroup {
             let available = mem - file_cache_reserved_bytes;
+            let value = cgroup.config.calculate_memory_high_value(available);
+
+            info!(value, "setting memory.high");
 
             cgroup
-                .set_memory_limits(available)
-                .context("failed to set cgroup memory limits")?;
-
-            let cgroup = Arc::new(cgroup);
-
-            // Some might call this . . . cgroup v2
-            let cgroup_clone = Arc::clone(&cgroup);
-
-            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
-                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
-            });
-
-            state.cgroup = Some(cgroup);
-        } else {
-            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
-            // This allows us to poll it in `Monitor::run` regardless of whether we
-            // are managing a cgroup or not. If we don't forget it, all receives will
-            // immediately return an error because the sender is droped and it will
-            // claim all select! statements, effectively turning `Monitor::run` into
-            // `loop { fail to receive }`.
-            mem::forget(requesting_send);
+                .set_memory_high_bytes(value)
+                .context("failed to set cgroup memory.high")?;
         }
 
         Ok(state)
@@ -257,14 +282,11 @@ impl Runner {
                 new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
             }
 
-            let limits = MemoryLimits {
-                // new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
-                // since it is properly initialized in the previous cgroup if let block
-                high: new_cgroup_mem_high,
-            };
+            // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
+            // since it is properly initialized in the previous cgroup if let block
             cgroup
-                .set_limits(&limits)
-                .context("failed to set cgroup memory limits")?;
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
 
             let message = format!(
                 "set cgroup memory.high to {} MiB, of new max {} MiB",
@@ -327,12 +349,9 @@ impl Runner {
                 name = cgroup.path(),
                 "updating cgroup memory.high",
             );
-            let limits = MemoryLimits {
-                high: new_cgroup_mem_high,
-            };
             cgroup
-                .set_limits(&limits)
-                .context("failed to set file cache size")?;
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
         }
 
         Ok(())

From 5fdc80db03c2bd70918a5ad7ea540d5ab1871712 Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Thu, 28 Sep 2023 00:52:39 -0700
Subject: [PATCH 09/24] Bump vm-builder v0.17.11 -> v0.17.12 (#5407)

Only relevant change is neondatabase/autoscaling#534 - refer there for
more details.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7271a8d29f..65a2101dc6 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -834,7 +834,7 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.17.11
+      VM_BUILDER_VERSION: v0.17.12
 
     steps:
       - name: Checkout

From 6b4bb91d0a039c17b418bf8c54af05c0734aca5f Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 28 Sep 2023 10:07:11 +0100
Subject: [PATCH 10/24] docs/rfcs: add RFC for fast tenant migration/failover
 (#5029)

## Problem

Currently we don't have a way to migrate tenants from one pageserver to
another without a risk of gap in availability.

## Summary of changes

This follows on from https://github.com/neondatabase/neon/pull/4919

Migrating tenants between pageservers is essential to operating a
service
at scale, in several contexts:

1. Responding to a pageserver node failure by migrating tenants to other
pageservers
2. Balancing load and capacity across pageservers, for example when a
user expands their
   database and they need to migrate to a pageserver with more capacity.
3. Restarting pageservers for upgrades and maintenance

Currently, a tenant may be migrated by attaching to a new node,
re-configuring endpoints to use the new node, and then later detaching
from the old node. This is safe once [generation
numbers](025-generation-numbers.md) are implemented, but does not meet
our seamless/fast/efficient goals.

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 docs/rfcs/028-pageserver-migration.md | 599 ++++++++++++++++++++++++++
 1 file changed, 599 insertions(+)
 create mode 100644 docs/rfcs/028-pageserver-migration.md

diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md
new file mode 100644
index 0000000000..f708f641aa
--- /dev/null
+++ b/docs/rfcs/028-pageserver-migration.md
@@ -0,0 +1,599 @@
+# Seamless tenant migration
+
+- Author: john@neon.tech
+- Created on 2023-08-11
+- Implemented on ..
+
+## Summary
+
+The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant
+migration safe". Following that,
+this RFC is about how those migrations are to be done:
+
+1. Seamlessly (without interruption to client availability)
+2. Quickly (enabling faster operations)
+3. Efficiently (minimizing I/O and $ cost)
+
+These points are in priority order: if we have to sacrifice
+efficiency to make a migration seamless for clients, we will
+do so, etc.
+
+This is accomplished by introducing two high level changes:
+
+- A dual-attached state for tenants, used in a control-plane-orchestrated
+  migration procedure that preserves availability during a migration.
+- Warm secondary locations for tenants, where on-disk content is primed
+  for a fast migration of the tenant from its current attachment to this
+  secondary location.
+
+## Motivation
+
+Migrating tenants between pageservers is essential to operating a service
+at scale, in several contexts:
+
+1. Responding to a pageserver node failure by migrating tenants to other pageservers
+2. Balancing load and capacity across pageservers, for example when a user expands their
+   database and they need to migrate to a pageserver with more capacity.
+3. Restarting pageservers for upgrades and maintenance
+
+The current steps for migration are:
+
+- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)).
+- attach to new node
+- re-configure endpoints to use the new node
+
+Once [generation numbers](025-generation-numbers.md) are implemented,
+the detach step is no longer critical for correctness. So, we can
+
+- attach to a new node,
+- re-configure endpoints to use the new node, and then
+- detach from the old node.
+
+However, this still does not meet our seamless/fast/efficient goals:
+
+- Not fast: The new node will have to download potentially large amounts
+  of data from S3, which may take many minutes.
+- Not seamless: If we attach to a new pageserver before detaching an old one,
+  the new one might delete some objects that interrupt availability of reads on the old one.
+- Not efficient: the old pageserver will continue uploading
+  S3 content during the migration that will never be read.
+
+The user expectations for availability are:
+
+- For planned maintenance, there should be zero availability
+  gap. This expectation is fulfilled by this RFC.
+- For unplanned changes (e.g. node failures), there should be
+  minimal availability gap. This RFC provides the _mechanism_
+  to fail over quickly, but does not provide the failure _detection_
+  nor failover _policy_.
+
+## Non Goals
+
+- Defining service tiers with different storage strategies: the same
+  level of HA & overhead will apply to all tenants. This doesn't rule out
+  adding such tiers in future.
+- Enabling pageserver failover in the absence of a control plane: the control
+  plane will remain the source of truth for what should be attached where.
+- Totally avoiding availability gaps on unplanned migrations during
+  a failure (we expect a small, bounded window of
+  read unavailability of very recent LSNs)
+- Workload balancing: this RFC defines the mechanism for moving tenants
+  around, not the higher level logic for deciding who goes where.
+- Defining all possible configuration flows for tenants: the migration process
+  defined in this RFC demonstrates the sufficiency of the pageserver API, but
+  is not the only kind of configuration change the control plane will ever do.
+  The APIs defined here should let the control plane move tenants around in
+  whatever way is needed while preserving data safety and read availability.
+
+## Impacted components
+
+Pageserver, control plane
+
+## Terminology
+
+- **Attachment**: a tenant is _attached_ to a pageserver if it has
+  been issued a generation number, and is running an instance of
+  the `Tenant` type, ingesting the WAL, and available to serve
+  page reads.
+- **Location**: locations are a superset of attachments. A location
+  is a combination of a tenant and a pageserver. We may _attach_ at a _location_.
+
+- **Secondary location**: a location which is not currently attached.
+- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features.
+
+## Implementation (high level)
+
+### Warm secondary locations
+
+To enable faster migrations, we will identify at least one _secondary location_
+for each tenant. This secondary location will keep a warm cache of layers
+for the tenant, so that if it is later attached, it can catch up with the
+latest LSN quickly: rather than downloading everything, it only has to replay
+the recent part of the WAL to advance from the remote_consistent_offset to the
+most recent LSN in the WAL.
+
+The control plane is responsible for selecting secondary locations, and
+calling into pageservers to configure tenants into a secondary mode at this
+new location, as well as attaching the tenant in its existing primary location.
+
+The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap)
+to advise secondaries of which layers should be downloaded.
+
+### Location modes
+
+Currently, we consider a tenant to be in one of two states on a pageserver:
+
+- Attached: active `Tenant` object, and layers on local disk
+- Detached: no layers on local disk, no runtime state.
+
+We will extend this with finer-grained modes, whose purpose will become
+clear in later sections:
+
+- **AttachedSingle**: equivalent to the existing attached state.
+- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but
+  does not do deletions.
+- **AttachedStale**: like AttachedSingle, but holds a stale generation and
+  does not do any remote storage operations.
+- **Secondary**: keep local state on disk, periodically update from S3.
+- **Detached**: equivalent to existing detached state.
+
+To control these finer grained states, a new pageserver API endpoint will be added.
+
+### Cutover procedure
+
+Define old location and new location as "Node A" and "Node B". Consider
+the case where both nodes are available, and Node B was previously configured
+as a secondary location for the tenant we are migrating.
+
+The cutover procedure is orchestrated by the control plane, calling into
+the pageservers' APIs:
+
+1. Call to Node A requesting it to flush to S3 and enter AttachedStale state
+2. Increment generation, and call to Node B requesting it to enter AttachedMulti
+   state with the new generation.
+3. Call to Node B, requesting it to download the latest hot layers from remote storage,
+   according to the latest heatmap flushed by Node A.
+4. Wait for Node B's WAL ingestion to catch up with node A's
+5. Update endpoints to use node B instead of node A
+6. Call to node B requesting it to enter state AttachedSingle.
+7. Call to node A requesting it to enter state Secondary
+
+The following table summarizes how the state of the system advances:
+
+|     Step      |     Node A     |     Node B     | Node used by endpoints |
+| :-----------: | :------------: | :------------: | :--------------------: |
+| 1 (_initial_) | AttachedSingle |   Secondary    |           A            |
+|       2       | AttachedStale  | AttachedMulti  |           A            |
+|       3       | AttachedStale  | AttachedMulti  |           A            |
+|       4       | AttachedStale  | AttachedMulti  |           A            |
+| 5 (_cutover_) | AttachedStale  | AttachedMulti  |           B            |
+|       6       | AttachedStale  | AttachedSingle |           B            |
+|  7 (_final_)  |   Secondary    | AttachedSingle |           B            |
+
+The procedure described for a clean handover from a live node to a secondary
+is also used for failure cases and for migrations to a location that is not
+configured as a secondary, by simply skipping irrelevant steps, as described in
+the following sections.
+
+#### Migration from an unresponsive node
+
+If node A is unavailable, then all calls into
+node A are skipped and we don't wait for B to catch up before
+updating the endpoints to use B.
+
+#### Migration to a location that is not a secondary
+
+If node B is initially in Detached state, the procedure is identical. Since Node B
+is coming from a Detached state rather than Secondary, the download of layers and
+catch up with WAL will take much longer.
+
+We might do this if:
+
+- Attached and secondary locations are both critically low on disk, and we need
+  to migrate to a third node with more resources available.
+- We are migrating a tenant which does not use secondary locations to save on cost.
+
+#### Permanent migration away from a node
+
+In the final step of the migration, we generally request the original node to enter a Secondary
+state. This is typical if we are doing a planned migration during maintenance, or to
+balance CPU/network load away from a node.
+
+One might also want to permanently migrate away: this can be done by simply removing the secondary
+location after the migration is complete, or as an optimization by substituting the Detached state
+for the Secondary state in the final step.
+
+#### Cutover diagram
+
+```mermaid
+sequenceDiagram
+participant CP as Control plane
+participant A as Node A
+participant B as Node B
+participant E as Endpoint
+
+CP->>A: PUT Flush & go to AttachedStale
+note right of A: A continues to ingest WAL
+CP->>B: PUT AttachedMulti
+CP->>B: PUT Download layers from latest heatmap
+note right of B: B downloads from S3
+loop Poll until download complete
+CP->>B: GET download status
+end
+activate B
+note right of B: B ingests WAL
+loop Poll until catch up
+CP->>B: GET visible WAL
+CP->>A: GET visible WAL
+end
+deactivate B
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+CP->>B: PUT AttachedSingle
+CP->>A: PUT Secondary
+```
+
+#### Cutover from an unavailable pageserver
+
+This case is far simpler: we may skip straight to our intended
+end state.
+
+```mermaid
+sequenceDiagram
+participant A as Node A
+participant CP as Control plane
+participant B as Node B
+participant E as Endpoint
+
+note right of A: Node A offline
+activate A
+CP->>B: PUT AttachedSingle
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+deactivate A
+```
+
+## Implementation (detail)
+
+### Purpose of AttachedMulti, AttachedStale
+
+#### AttachedMulti
+
+Ordinarily, an attached pageserver whose generation is the latest may delete
+layers at will (e.g. during compaction). If a previous generation pageserver
+is also still attached, and in use by endpoints, then this layer deletion could
+lead to a loss of availability for the endpoint when reading from the previous
+generation pageserver.
+
+The _AttachedMulti_ state simply disables deletions. These will be enqueued
+in `RemoteTimelineClient` until the control plane transitions the
+node into AttachedSingle, which unblocks deletions.  Other remote storage operations
+such as uploads are not blocked.
+
+AttachedMulti is not required for data safety, only to preserve availability
+on pageservers running with stale generations.
+
+A node enters AttachedMulti only when explicitly asked to by the control plane. It should
+only remain in this state for the duration of a migration.
+
+If a control plane bug leaves
+the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued
+deletions. This may be accomplished simply, by dropping enqueued deletions when some modest
+threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions,
+it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or
+by timeline deletion.
+
+During AttachedMulti, the Tenant is free to drop layers from local disk in response to
+disk pressure: only the deletion of remote layers is blocked.
+
+#### AttachedStale
+
+Currently, a pageserver with a stale generation number will continue to
+upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation
+will not be read back by future generations of pageservers.
+
+The _AttachedStale_ state disables S3 uploads. The stale pageserver
+will continue to ingest the WAL and write layers to local disk, but will not
+do any uploads to S3.
+
+A node may enter AttachedStale in two ways:
+
+- Explicitly, when control plane calls into the node at the start of a migration.
+- Implicitly, when the node tries to validate some deletions and discovers
+  that its generation is stale.
+
+The AttachedStale state also disables sending consumption metrics from
+that location: it is interpreted as an indication that some other pageserver
+is already attached or is about to be attached, and that new pageserver will
+be responsible for sending consumption metrics.
+
+#### Disk Pressure & AttachedStale
+
+Over long periods of time, a tenant location in AttachedStale will accumulate data
+on local disk, as it cannot evict any layers written since it entered the
+AttachedStale state. We rely on the control plane to revert the location to
+Secondary or Detached at the end of a migration.
+
+This scenario is particularly noteworthy when evacuating all tenants on a pageserver:
+since _all_ the attached tenants will go into AttachedStale, we will be doing no
+uploads at all, therefore ingested data will cause disk usage to increase continuously.
+Under nominal conditions, the available disk space on pageservers should be sufficient
+to complete the evacuation before this becomes a problem, but we must also handle
+the case where we hit a low disk situation while in this state.
+
+The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task`
+touches each Tenant when it determines that a low-disk condition requires
+some layer eviction. Having selected layers for eviction, the eviction
+task calls `Timeline::evict_layers`.
+
+**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted
+layers are not yet uploaded to S3, then the block on uploads will be lifted. This
+will result in leaking some objects once a migration is complete, but will enable
+the node to manage its disk space properly: if a node is left with some tenants
+in AttachedStale indefinitely due to a network partition or control plane bug,
+these tenants will not cause a full disk condition.
+
+### Warm secondary updates
+
+#### Layer heatmap
+
+The secondary location's job is to serve reads **with the same quality of service as the original location
+was serving them around the time of a migration**. This does not mean the secondary
+location needs the whole set of layers: inactive layers that might soon
+be evicted on the attached pageserver need not be downloaded by the
+secondary. A totally idle tenant only needs to maintain enough on-disk
+state to enable a fast cold start (i.e. the most recent image layers are
+typically sufficient).
+
+To enable this, we introduce the concept of a _layer heatmap_, which
+acts as an advisory input to secondary locations to decide which
+layers to download from S3.
+
+#### Attached pageserver
+
+The attached pageserver, if in state AttachedSingle, periodically
+uploads a serialized heat map to S3. It may skip this if there
+is no change since the last time it uploaded (e.g. if the tenant
+is totally idle).
+
+Additionally, when the tenant is flushed to remote storage prior to a migration
+(the first step in [cutover procedure](#cutover-procedure)), 
+the heatmap is written out. This enables a future attached pageserver
+to get an up to date view when deciding which layers to download.
+
+#### Secondary location behavior
+
+Secondary warm locations run a simple loop, implemented separately from
+the main `Tenant` type, which represents attached tenants:
+
+- Download the layer heatmap
+- Select any "hot enough" layers to download, if there is sufficient
+  free disk space.
+- Download layers, if they were not previously evicted (see below)
+- Download the latest index_part.json
+- Check if any layers currently on disk are no longer referenced by
+  IndexPart & delete them
+
+Note that the heatmap is only advisory: if a secondary location has plenty
+of disk space, it may choose to retain layers that aren't referenced
+by the heatmap, as long as they are still referenced by the IndexPart. Conversely,
+if a node is very low on disk space, it might opt to raise the heat threshold required
+to download a layer, until more disk space is available.
+
+#### Secondary locations & disk pressure
+
+Secondary locations are subject to eviction on disk pressure, just as
+attached locations are.  For eviction purposes, the access time of a
+layer in a secondary location will be the access time given in the heatmap,
+rather than the literal time at which the local layer file was accessed.
+
+The heatmap will indicate which layers are in local storage on the attached
+location.  The secondary will always attempt to get back to having that
+set of layers on disk, but to avoid flapping, it will remember the access
+time of the layer it was most recently asked to evict, and layers whose
+access time is below that will not be re-downloaded.
+
+The resulting behavior is that after a layer is evicted from a secondary
+location, it is only re-downloaded once the attached pageserver accesses
+the layer and uploads a heatmap reflecting that access time.  On a pageserver
+restart, the secondary location will attempt to download all layers in
+the heatmap again, if they are not on local disk.
+
+This behavior will be slightly different when secondary locations are
+used for "low energy tenants", but that is beyond the scope of this RFC.
+
+### Location configuration API
+
+Currently, the `/tenant/<tenant_id>/config` API defines various
+tunables like compaction settings, which apply to the tenant irrespective
+of which pageserver it is running on.
+
+A new "location config" structure will be introduced, which defines
+configuration which is per-tenant, but local to a particular pageserver,
+such as the attachment mode and whether it is a secondary.
+
+The pageserver will expose a new per-tenant API for setting
+the state: `/tenant/<tenant_id>/location/config`.
+
+Body content:
+
+```
+{
+  state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}',
+  generation: Option<u32>,
+  configuration: `Option<TenantConfig>`
+  flush: bool
+}
+```
+
+Existing `/attach` and `/detach` endpoints will have the same
+behavior as calling `/location/config` with `AttachedSingle` and `Detached`
+states respectively. These endpoints will be deprecated and later
+removed.
+
+The generation attribute is mandatory for entering `AttachedSingle` or
+`AttachedMulti`.
+
+The configuration attribute is mandatory when entering any state other
+than `Detached`. This configuration is the same as the body for
+the existing `/tenant/<tenant_id>/config` endpoint.
+
+The `flush` argument indicates whether the pageserver should flush
+to S3 before proceeding: this only has any effect if the node is
+currently in AttachedSingle or AttachedMulti. This is used
+during the first phase of migration, when transitioning the
+old pageserver to AttachedStale.
+
+The `/re-attach` API response will be extended to include a `state` as
+well as a `generation`, enabling the pageserver to enter the
+correct state for each tenant on startup.
+
+### Database schema for locations
+
+A new table `ProjectLocation`:
+
+- pageserver_id: int
+- tenant_id: TenantId
+- generation: Option<int>
+- state: `enum(Secondary, AttachedSingle, AttachedMulti)`
+
+Notes:
+
+- It is legal for a Project to have zero `ProjectLocation`s
+- The `pageserver` column in `Project` now means "to which pageserver should
+  endpoints connect", rather than simply which pageserver is attached.
+- The `generation` column in `Project` remains, and is incremented and used
+  to set the generation of `ProjectLocation` rows when they are set into
+  an attached state.
+- The `Detached` state is implicitly represented as the absence of
+  a `ProjectLocation`.
+
+### Executing migrations
+
+Migrations will be implemented as Go functions, within the
+existing `Operation` framework in the control plane. These
+operations are persistent, such that they will always keep
+trying until completion: this property is important to avoid
+leaving garbage behind on pageservers, such as AttachedStale
+locations.
+
+### Recovery from failures during migration
+
+During migration, the control plane may encounter failures of either
+the original or new pageserver, or both:
+
+- If the original fails, skip past waiting for the new pageserver
+  to catch up, and put it into AttachedSingle immediately.
+- If the new node fails, put the old pageserver into Secondary
+  and then back into AttachedSingle (this has the effect of
+  retaining on-disk state and granting it a fresh generation number).
+- If both nodes fail, keep trying until one of them is available
+  again.
+
+### Control plane -> Pageserver reconciliation
+
+A migration may be done while the old node is unavailable,
+in which case the old node may still be running in an AttachedStale
+state.
+
+In this case, it is undesirable to have the migration `Operation`
+stay alive until the old node eventually comes back online
+and can be cleaned up. To handle this, the control plane
+should run a background reconciliation process to compare
+a pageserver's attachments with the database, and clean up
+any that shouldn't be there any more.
+
+Note that there will be no work to do if the old node was really
+offline, as during startup it will call into `/re-attach` and
+be updated that way. The reconciliation will only be needed
+if the node was unavailable but still running.
+
+## Alternatives considered
+
+### Only enabling secondary locations for tenants on a higher service tier
+
+This will make sense in future, especially for tiny databases that may be
+downloaded from S3 in milliseconds when needed.
+
+However, it is not wise to do it immediately, because pageservers contain
+a mixture of higher and lower tier workloads. If we had 1 tenant with
+a secondary location and 9 without, then those other 9 tenants will do
+a lot of I/O as they try to recover from S3, which may degrade the
+service of the tenant which had a secondary location.
+
+Until we segregate tenants of different service tiers onto different pageserver
+nodes, or implement & test QoS to ensure that tenants with secondaries are
+not harmed by tenants without, we should use the same failover approach
+for all the tenants.
+
+### Hot secondary locations (continuous WAL replay)
+
+Instead of secondary locations populating their caches from S3, we could
+have them consume the WAL from safekeepers. The downsides of this would be:
+
+- Double load on safekeepers, which are a less scalable service than S3
+- Secondary locations' on-disk state would end up subtly different to
+  the remote state, which would make synchronizing with S3 more complex/expensive
+  when going into attached state.
+
+The downside of only updating secondary locations from S3 is that we will
+have a delay during migration from replaying the LSN range between what's
+in S3 and what's in the pageserver. This range will be very small on
+planned migrations, as we have the old pageserver flush to S3 immediately
+before attaching the new pageserver. On unplanned migrations (old pageserver
+is unavailable), the range of LSNs to replay is bounded by the flush frequency
+on the old pageserver. However, the migration doesn't have to wait for the
+replay: it's just that not-yet-replayed LSNs will be unavailable for read
+until the new pageserver catches up.
+
+We expect that pageserver reads of the most recent LSNs will be relatively
+rare, as for an active endpoint those pages will usually still be in the postgres
+page cache: this leads us to prefer synchronizing from S3 on secondary
+locations, rather than consuming the WAL from safekeepers.
+
+### Cold secondary locations
+
+It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then
+we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes).
+
+Warm caches on secondary locations are necessary to meet
+our availability goals.
+
+### Pageserver-granularity failover
+
+Instead of migrating tenants individually, we could have entire spare nodes,
+and on a node death, move all its work to one of these spares.
+
+This approach is avoided for several reasons:
+
+- we would still need fine-grained tenant migration for other
+  purposes such as balancing load
+- by sharing the spare capacity over many peers rather than one spare node,
+  these peers may use the capacity for other purposes, until it is needed
+  to handle migrated tenants. e.g. for keeping a deeper cache of their
+  attached tenants.
+
+### Readonly during migration
+
+We could simplify migrations by making both previous and new nodes go into a
+readonly state, then flush remote content from the previous node, then activate
+attachment on the secondary node.
+
+The downside to this approach is a potentially large gap in readability of
+recent LSNs while loading data onto the new node. To avoid this, it is worthwhile
+to incur the extra cost of double-replaying the WAL onto old and new nodes' local
+storage during a migration.
+
+### Peer-to-peer pageserver communication
+
+Rather than uploading the heatmap to S3, attached pageservers could make it
+available to peers.
+
+Currently, pageservers have no peer to peer communication, so adding this
+for heatmaps would incur significant overhead in deployment and configuration
+of the service, and ensuring that when a new pageserver is deployed, other
+pageservers are updated to be aware of it.
+
+As well as simplifying implementation, putting heatmaps in S3 will be useful
+for future analytics purposes -- gathering aggregated statistics on activity
+patterns across many tenants may be done directly from data in S3.

From af28362a47341235b2d6cc81b7bb4b0159d18a5c Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 28 Sep 2023 12:25:20 +0300
Subject: [PATCH 11/24] tests: Default to LOCAL_FS for pageserver remote
 storage (#5402)

Part of #5172. Builds upon #5243, #5298. Includes the test changes:
- no more RemoteStorageKind.NOOP
- no more testing of pageserver without remote storage
- benchmarks now use LOCAL_FS as well

Support for running without RemoteStorage is still kept but in practice,
there are no tests and should not be any tests.

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/bin/pageserver.rs              |  2 +-
 test_runner/fixtures/compare_fixtures.py      |  2 +
 test_runner/fixtures/neon_fixtures.py         | 23 ++++-
 test_runner/fixtures/remote_storage.py        |  8 +-
 test_runner/regress/test_broken_timeline.py   | 52 +++++------
 .../test_pageserver_metric_collection.py      | 23 ++---
 test_runner/regress/test_tenant_delete.py     | 44 +++-------
 test_runner/regress/test_tenant_detach.py     |  7 +-
 test_runner/regress/test_tenants.py           | 19 +---
 test_runner/regress/test_timeline_delete.py   | 40 ++++-----
 test_runner/regress/test_timeline_size.py     | 88 ++++++-------------
 11 files changed, 109 insertions(+), 199 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 90c7c11194..d8a00b677b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -644,7 +644,7 @@ fn create_remote_storage_client(
     let config = if let Some(config) = &conf.remote_storage_config {
         config
     } else {
-        // No remote storage configured.
+        tracing::warn!("no remote storage configured, this is a deprecated configuration");
         return Ok(None);
     };
 
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 1254c4e779..6fbaa08512 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -105,6 +105,8 @@ class NeonCompare(PgCompare):
         self._pg_bin = pg_bin
         self.pageserver_http_client = self.env.pageserver.http_client()
 
+        # note that neon_simple_env now uses LOCAL_FS remote storage
+
         # Create tenant
         tenant_conf: Dict[str, str] = {}
         if False:  # TODO add pytest setting for this
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 38d0aeb960..92e7cd06cd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -460,9 +460,11 @@ class NeonEnvBuilder:
         ), "Unexpectedly instantiated from outside a test function"
         self.test_name = test_name
 
-    def init_configs(self) -> NeonEnv:
+    def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv:
         # Cannot create more than one environment from one builder
         assert self.env is None, "environment already initialized"
+        if default_remote_storage_if_missing and self.pageserver_remote_storage is None:
+            self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
         self.env = NeonEnv(self)
         return self.env
 
@@ -470,8 +472,19 @@ class NeonEnvBuilder:
         assert self.env is not None, "environment is not already initialized, call init() first"
         self.env.start()
 
-    def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
-        env = self.init_configs()
+    def init_start(
+        self,
+        initial_tenant_conf: Optional[Dict[str, str]] = None,
+        default_remote_storage_if_missing: bool = True,
+    ) -> NeonEnv:
+        """
+        Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
+
+        To avoid creating initial_tenant, call init_configs to setup the environment.
+
+        Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one.
+        """
+        env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing)
         self.start()
 
         # Prepare the default branch to start the postgres on later.
@@ -546,7 +559,7 @@ class NeonEnvBuilder:
         user: RemoteStorageUser,
         bucket_name: Optional[str] = None,
         bucket_region: Optional[str] = None,
-    ) -> Optional[RemoteStorage]:
+    ) -> RemoteStorage:
         ret = kind.configure(
             self.repo_dir,
             self.mock_s3_server,
@@ -889,6 +902,8 @@ def _shared_simple_env(
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
      is set, this is shared by all tests using `neon_simple_env`.
+
+    This fixture will use RemoteStorageKind.LOCAL_FS with pageserver.
     """
 
     if os.environ.get("TEST_SHARED_FIXTURES") is None:
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index f7cddbc821..535f8c2fe7 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -202,9 +202,6 @@ class RemoteStorageKind(str, enum.Enum):
     LOCAL_FS = "local_fs"
     MOCK_S3 = "mock_s3"
     REAL_S3 = "real_s3"
-    # Pass to tests that are generic to remote storage
-    # to ensure the test pass with or without the remote storage
-    NOOP = "noop"
 
     def configure(
         self,
@@ -215,10 +212,7 @@ class RemoteStorageKind(str, enum.Enum):
         user: RemoteStorageUser,
         bucket_name: Optional[str] = None,
         bucket_region: Optional[str] = None,
-    ) -> Optional[RemoteStorage]:
-        if self == RemoteStorageKind.NOOP:
-            return None
-
+    ) -> RemoteStorage:
         if self == RemoteStorageKind.LOCAL_FS:
             return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user))
 
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index d0462844f0..c80f2d8360 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -4,7 +4,12 @@ from typing import List, Tuple
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    NeonEnvBuilder,
+    wait_for_last_flush_lsn,
+)
 from fixtures.types import TenantId, TimelineId
 
 
@@ -26,17 +31,18 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
 
     tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = []
 
-    for _ in range(4):
+    for _ in range(3):
         tenant_id, timeline_id = env.neon_cli.create_tenant()
 
         endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
         with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key, value text)")
             cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'")
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
         endpoint.stop()
         tenant_timelines.append((tenant_id, timeline_id, endpoint))
 
-    # Stop the pageserver
+    # Stop the pageserver -- this has to be not immediate or we need to wait for uploads
     env.pageserver.stop()
 
     # Leave the first timeline alone, but corrupt the others in different ways
@@ -45,30 +51,21 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
 
     (tenant1, timeline1, pg1) = tenant_timelines[1]
     metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
-    f = open(metadata_path, "w")
-    f.write("overwritten with garbage!")
-    f.close()
+    with open(metadata_path, "w") as f:
+        f.write("overwritten with garbage!")
     log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
 
     (tenant2, timeline2, pg2) = tenant_timelines[2]
     timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
-    for filename in os.listdir(timeline_path):
-        if filename.startswith("00000"):
-            # Looks like a layer file. Remove it
-            os.remove(f"{timeline_path}/{filename}")
-    log.info(
-        f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)"
-    )
-
-    (tenant3, timeline3, pg3) = tenant_timelines[3]
-    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant3}/timelines/{timeline3}/"
     for filename in os.listdir(timeline_path):
         if filename.startswith("00000"):
             # Looks like a layer file. Corrupt it
-            f = open(f"{timeline_path}/{filename}", "w")
-            f.write("overwritten with garbage!")
-            f.close()
-    log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled")
+            p = f"{timeline_path}/{filename}"
+            size = os.path.getsize(p)
+            with open(p, "wb") as f:
+                f.truncate(0)
+                f.truncate(size)
+    log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")
 
     env.pageserver.start()
 
@@ -87,22 +84,13 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
         f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
     )
 
-    # Second timeline has no ancestors, only the metadata file and no layer files locally,
-    # and we don't have the remote storage enabled. It is loaded into memory, but getting
-    # the basebackup from it will fail.
-    with pytest.raises(
-        Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
-    ) as err:
-        pg2.start()
-    log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
-
-    # Third timeline will also fail during basebackup, because the layer file is corrupt.
+    # Second timeline will fail during basebackup, because the local layer file is corrupt.
     # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
     # (We don't check layer file contents on startup, when loading the timeline)
     with pytest.raises(Exception, match="Failed to load delta layer") as err:
-        pg3.start()
+        pg2.start()
     log.info(
-        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
+        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
     )
 
 
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index 74e016a9df..b76dbbee03 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -5,7 +5,6 @@ from pathlib import Path
 from queue import SimpleQueue
 from typing import Any, Dict, Set
 
-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -17,18 +16,13 @@ from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
-
 # TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
 
 
-@pytest.mark.parametrize(
-    "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
-)
 def test_metric_collection(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
     httpserver_listen_address,
-    remote_storage_kind: RemoteStorageKind,
 ):
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -58,7 +52,7 @@ def test_metric_collection(
         synthetic_size_calculation_interval="3s"
         """
 
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
 
@@ -109,17 +103,14 @@ def test_metric_collection(
             total += sample[2]
         return int(total)
 
-    remote_uploaded = 0
-
     # upload some data to remote storage
-    if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
-        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-        pageserver_http = env.pageserver.http_client()
-        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    pageserver_http = env.pageserver.http_client()
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
 
-        remote_uploaded = get_num_remote_ops("index", "upload")
-        assert remote_uploaded > 0
+    remote_uploaded = get_num_remote_ops("index", "upload")
+    assert remote_uploaded > 0
 
     # we expect uploads at 1Hz, on busy runners this could be too optimistic,
     # so give 5s we only want to get the following upload after "ready" value.
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 9d0d42a4ef..3a56ca51a6 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -30,9 +30,7 @@ from fixtures.types import TenantId
 from fixtures.utils import run_pg_bench_small
 
 
-@pytest.mark.parametrize(
-    "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
-)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_tenant_delete_smoke(
     neon_env_builder: NeonEnvBuilder,
     remote_storage_kind: RemoteStorageKind,
@@ -144,18 +142,12 @@ FAILPOINTS_BEFORE_BACKGROUND = [
 def combinations():
     result = []
 
-    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+    remotes = [RemoteStorageKind.MOCK_S3]
     if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
         remotes.append(RemoteStorageKind.REAL_S3)
 
     for remote_storage_kind in remotes:
         for delete_failpoint in FAILPOINTS:
-            if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
-                "timeline-delete-before-index-delete",
-            ):
-                # the above failpoint are not relevant for config without remote storage
-                continue
-
             # Simulate failures for only one type of remote storage
             # to avoid log pollution and make tests run faster
             if remote_storage_kind is RemoteStorageKind.MOCK_S3:
@@ -215,21 +207,18 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
     with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
         # generate enough layers
         run_pg_bench_small(pg_bin, endpoint.connstr())
-        if remote_storage_kind is RemoteStorageKind.NOOP:
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-        else:
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
 
-            if remote_storage_kind in available_s3_storages():
-                assert_prefix_not_empty(
-                    neon_env_builder,
-                    prefix="/".join(
-                        (
-                            "tenants",
-                            str(tenant_id),
-                        )
-                    ),
-                )
+        if remote_storage_kind in available_s3_storages():
+            assert_prefix_not_empty(
+                neon_env_builder,
+                prefix="/".join(
+                    (
+                        "tenants",
+                        str(tenant_id),
+                    )
+                ),
+            )
 
     ps_http.configure_failpoints((failpoint, "return"))
 
@@ -260,12 +249,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
         env.pageserver.stop()
         env.pageserver.start()
 
-        if (
-            remote_storage_kind is RemoteStorageKind.NOOP
-            and failpoint == "tenant-delete-before-create-local-mark"
-        ):
-            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
-        elif failpoint in (
+        if failpoint in (
             "tenant-delete-before-shutdown",
             "tenant-delete-before-create-remote-mark",
         ):
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 8ccbcf551d..a20523b1f3 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -519,11 +519,8 @@ def test_detach_while_attaching(
 # * restart the pageserver and verify that ignored tenant is still not loaded
 # * `load` the same tenant
 # * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
-def test_ignored_tenant_reattach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 677c0d18e8..40dff194aa 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -12,7 +12,6 @@ from fixtures.log_helper import log
 from fixtures.metrics import (
     PAGESERVER_GLOBAL_METRICS,
     PAGESERVER_PER_TENANT_METRICS,
-    PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     parse_metrics,
 )
 from fixtures.neon_fixtures import (
@@ -232,17 +231,10 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
         assert value
 
 
-@pytest.mark.parametrize(
-    "remote_storage_kind",
-    # exercise both the code paths where remote_storage=None and remote_storage=Some(...)
-    [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
-)
-def test_pageserver_metrics_removed_after_detach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
-):
+def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
     """Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
 
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
 
     neon_env_builder.num_safekeepers = 3
 
@@ -282,9 +274,6 @@ def test_pageserver_metrics_removed_after_detach(
     for tenant in [tenant_1, tenant_2]:
         pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
         expected = set(PAGESERVER_PER_TENANT_METRICS)
-        if remote_storage_kind == RemoteStorageKind.NOOP:
-            # if there's no remote storage configured, we don't expose the remote timeline client metrics
-            expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
         assert pre_detach_samples == expected
 
         env.pageserver.http_client().tenant_detach(tenant)
@@ -294,9 +283,7 @@ def test_pageserver_metrics_removed_after_detach(
 
 
 # Check that empty tenants work with or without the remote storage
-@pytest.mark.parametrize(
-    "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP]
-)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_pageserver_with_empty_tenants(
     neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
 ):
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 839df69240..3af144c31c 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
     last_flush_lsn_upload,
-    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
@@ -145,19 +144,12 @@ DELETE_FAILPOINTS = [
 def combinations():
     result = []
 
-    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+    remotes = [RemoteStorageKind.MOCK_S3]
     if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
         remotes.append(RemoteStorageKind.REAL_S3)
 
     for remote_storage_kind in remotes:
         for delete_failpoint in DELETE_FAILPOINTS:
-            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
-                "timeline-delete-before-index-delete",
-                "timeline-delete-after-index-delete",
-            ):
-                # the above failpoints are not relevant for config without remote storage
-                continue
-
             result.append((remote_storage_kind, delete_failpoint))
     return result
 
@@ -205,23 +197,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
     with env.endpoints.create_start("delete") as endpoint:
         # generate enough layers
         run_pg_bench_small(pg_bin, endpoint.connstr())
-        if remote_storage_kind is RemoteStorageKind.NOOP:
-            wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
-        else:
-            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
 
-            if remote_storage_kind in available_s3_storages():
-                assert_prefix_not_empty(
-                    neon_env_builder,
-                    prefix="/".join(
-                        (
-                            "tenants",
-                            str(env.initial_tenant),
-                            "timelines",
-                            str(timeline_id),
-                        )
-                    ),
-                )
+        last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
+
+        if remote_storage_kind in available_s3_storages():
+            assert_prefix_not_empty(
+                neon_env_builder,
+                prefix="/".join(
+                    (
+                        "tenants",
+                        str(env.initial_tenant),
+                        "timelines",
+                        str(timeline_id),
+                    )
+                ),
+            )
 
     env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
     # It appears when we stopped flush loop during deletion and then pageserver is stopped
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index f856b26c6e..c2e93c48c7 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -301,12 +301,8 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_init(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
 
@@ -337,17 +333,12 @@ def test_timeline_physical_size_init(
     )
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-        remote_storage_kind,
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
     )
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_checkpoint(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
 
@@ -369,19 +360,14 @@ def test_timeline_physical_size_post_checkpoint(
 
     def check():
         assert_physical_size_invariants(
-            get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-            remote_storage_kind,
+            get_physical_size_values(env, env.initial_tenant, new_timeline_id),
         )
 
     wait_until(10, 1, check)
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_compaction(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     # Disable background compaction as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
@@ -420,21 +406,15 @@ def test_timeline_physical_size_post_compaction(
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
 
-    if remote_storage_kind is not None:
-        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-        remote_storage_kind,
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
     )
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_timeline_physical_size_post_gc(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
@@ -471,12 +451,10 @@ def test_timeline_physical_size_post_gc(
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
 
-    if remote_storage_kind is not None:
-        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)
 
     assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-        remote_storage_kind,
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id),
     )
 
 
@@ -560,14 +538,10 @@ def test_timeline_size_metrics(
     assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
 
 
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_tenant_physical_size(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
-):
+def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder):
     random.seed(100)
 
-    if remote_storage_kind is not None:
-        neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
 
@@ -575,12 +549,10 @@ def test_tenant_physical_size(
     client = env.pageserver.http_client()
 
     tenant, timeline = env.neon_cli.create_tenant()
-    if remote_storage_kind is not None:
-        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
 
     def get_timeline_resident_physical_size(timeline: TimelineId):
-        sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
-        assert_physical_size_invariants(sizes, remote_storage_kind)
+        sizes = get_physical_size_values(env, tenant, timeline)
+        assert_physical_size_invariants(sizes)
         return sizes.prometheus_resident_physical
 
     timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
@@ -600,8 +572,7 @@ def test_tenant_physical_size(
         wait_for_last_flush_lsn(env, endpoint, tenant, timeline)
         pageserver_http.timeline_checkpoint(tenant, timeline)
 
-        if remote_storage_kind is not None:
-            wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
+        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)
 
         timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
 
@@ -630,7 +601,6 @@ def get_physical_size_values(
     env: NeonEnv,
     tenant_id: TenantId,
     timeline_id: TimelineId,
-    remote_storage_kind: Optional[RemoteStorageKind],
 ) -> TimelinePhysicalSizeValues:
     res = TimelinePhysicalSizeValues()
 
@@ -646,12 +616,9 @@ def get_physical_size_values(
     res.prometheus_resident_physical = metrics.query_one(
         "pageserver_resident_physical_size", metrics_filter
     ).value
-    if remote_storage_kind is not None:
-        res.prometheus_remote_physical = metrics.query_one(
-            "pageserver_remote_physical_size", metrics_filter
-        ).value
-    else:
-        res.prometheus_remote_physical = None
+    res.prometheus_remote_physical = metrics.query_one(
+        "pageserver_remote_physical_size", metrics_filter
+    ).value
 
     detail = client.timeline_detail(
         tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
@@ -664,20 +631,15 @@ def get_physical_size_values(
     return res
 
 
-def assert_physical_size_invariants(
-    sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
-):
+def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
     # resident phyiscal size is defined as
     assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
     assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum
 
     # we don't do layer eviction, so, all layers are resident
     assert sizes.api_current_physical == sizes.prometheus_resident_physical
-    if remote_storage_kind is not None:
-        assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
-        # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
-    else:
-        assert sizes.prometheus_remote_physical is None
+    assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
+    # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,

From 528fb1bd81ea4a71e903e1b1be3c3b08ced0ce73 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 28 Sep 2023 11:38:26 +0100
Subject: [PATCH 12/24] proxy: metrics2 (#5179)

## Problem

We need to always count metrics while a connection is open, not only when
the transfer is 0.

We also need to count bytes usage for HTTP.

## Summary of changes

New structure for usage metrics. A `DashMap<Ids, Arc<Counters>>`.

If the `Arc` has 1 owner (the map), then I can conclude that no connections
are open.
If the counters have a non-zero "open_connections", then I can conclude a
new connection was opened in the last interval and should be reported
on.

Also, keep count of how many bytes were processed for HTTP and report it
here.
---
 Cargo.lock                          |   1 +
 Cargo.toml                          |   1 +
 libs/consumption_metrics/src/lib.rs |   2 +-
 proxy/Cargo.toml                    |   1 +
 proxy/src/http/conn_pool.rs         |  20 +-
 proxy/src/http/sql_over_http.rs     | 102 ++++++--
 proxy/src/http/websocket.rs         |  43 +---
 proxy/src/metrics.rs                | 363 ++++++++++++++++++----------
 proxy/src/proxy.rs                  |   7 +
 9 files changed, 347 insertions(+), 193 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 55c80e30a7..b22f081bdc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3246,6 +3246,7 @@ dependencies = [
  "reqwest-tracing",
  "routerify",
  "rstest",
+ "rustc-hash",
  "rustls",
  "rustls-pemfile",
  "scopeguard",
diff --git a/Cargo.toml b/Cargo.toml
index 4fe3069822..b0bcf69039 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,6 +107,7 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
+rustc-hash = "1.1.0"
 rustls = "0.21"
 rustls-pemfile = "1"
 rustls-split = "0.3"
diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs
index 7b133c61af..9e89327e84 100644
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
 
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
     pub events: std::borrow::Cow<'a, [T]>,
 }
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index cbab0c6f07..92498d3ecd 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -42,6 +42,7 @@ reqwest-middleware.workspace = true
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
 routerify.workspace = true
+rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
 scopeguard.workspace = true
diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs
index e771e5d7ed..a7ef15d342 100644
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -17,11 +17,12 @@ use std::{
 use tokio::time;
 use tokio_postgres::AsyncMessage;
 
-use crate::{auth, console};
+use crate::{
+    auth, console,
+    metrics::{Ids, MetricCounter, USAGE_METRICS},
+};
 use crate::{compute, config};
 
-use super::sql_over_http::MAX_RESPONSE_SIZE;
-
 use crate::proxy::ConnectMechanism;
 
 use tracing::{error, warn};
@@ -400,7 +401,6 @@ async fn connect_to_compute_once(
         .user(&conn_info.username)
         .password(&conn_info.password)
         .dbname(&conn_info.dbname)
-        .max_backend_message_size(MAX_RESPONSE_SIZE)
         .connect_timeout(timeout)
         .connect(tokio_postgres::NoTls)
         .await?;
@@ -412,6 +412,10 @@ async fn connect_to_compute_once(
     span.in_scope(|| {
         info!(%conn_info, %session, "new connection");
     });
+    let ids = Ids {
+        endpoint_id: node_info.aux.endpoint_id.to_string(),
+        branch_id: node_info.aux.branch_id.to_string(),
+    };
 
     tokio::spawn(
         poll_fn(move |cx| {
@@ -450,10 +454,18 @@ async fn connect_to_compute_once(
     Ok(Client {
         inner: client,
         session: tx,
+        ids,
     })
 }
 
 pub struct Client {
     pub inner: tokio_postgres::Client,
     session: tokio::sync::watch::Sender<uuid::Uuid>,
+    ids: Ids,
+}
+
+impl Client {
+    pub fn metrics(&self) -> Arc<MetricCounter> {
+        USAGE_METRICS.register(self.ids.clone())
+    }
 }
diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs
index fe57096105..b74b3e9646 100644
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -3,10 +3,12 @@ use std::sync::Arc;
 use anyhow::bail;
 use futures::pin_mut;
 use futures::StreamExt;
-use hashbrown::HashMap;
 use hyper::body::HttpBody;
+use hyper::header;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
+use hyper::Response;
+use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
@@ -16,7 +18,11 @@ use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
+use tracing::error;
+use tracing::instrument;
 use url::Url;
+use utils::http::error::ApiError;
+use utils::http::json::json_response;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
@@ -39,7 +45,6 @@ enum Payload {
     Batch(BatchQueryData),
 }
 
-pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
@@ -182,7 +187,45 @@ pub async fn handle(
     sni_hostname: Option<String>,
     conn_pool: Arc<GlobalConnPool>,
     session_id: uuid::Uuid,
-) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
+) -> Result<Response<Body>, ApiError> {
+    let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
+
+    let mut response = match result {
+        Ok(r) => r,
+        Err(e) => {
+            let message = format!("{:?}", e);
+            let code = match e.downcast_ref::<tokio_postgres::Error>() {
+                Some(e) => match e.code() {
+                    Some(e) => serde_json::to_value(e.code()).unwrap(),
+                    None => Value::Null,
+                },
+                None => Value::Null,
+            };
+            error!(
+                ?code,
+                "sql-over-http per-client task finished with an error: {e:#}"
+            );
+            // TODO: this shouldn't always be bad request.
+            json_response(
+                StatusCode::BAD_REQUEST,
+                json!({ "message": message, "code": code }),
+            )?
+        }
+    };
+    response.headers_mut().insert(
+        "Access-Control-Allow-Origin",
+        hyper::http::HeaderValue::from_static("*"),
+    );
+    Ok(response)
+}
+
+#[instrument(name = "sql-over-http", skip_all)]
+async fn handle_inner(
+    request: Request<Body>,
+    sni_hostname: Option<String>,
+    conn_pool: Arc<GlobalConnPool>,
+    session_id: uuid::Uuid,
+) -> anyhow::Result<Response<Body>> {
     //
     // Determine the destination and connection params
     //
@@ -233,13 +276,18 @@ pub async fn handle(
 
     let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;
 
+    let mut response = Response::builder()
+        .status(StatusCode::OK)
+        .header(header::CONTENT_TYPE, "application/json");
+
     //
     // Now execute the query and return the result
     //
+    let mut size = 0;
     let result = match payload {
-        Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
-            .await
-            .map(|x| (x, HashMap::default())),
+        Payload::Single(query) => {
+            query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await
+        }
         Payload::Batch(batch_query) => {
             let mut results = Vec::new();
             let mut builder = client.inner.build_transaction();
@@ -254,7 +302,8 @@ pub async fn handle(
             }
             let transaction = builder.start().await?;
             for query in batch_query.queries {
-                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
+                let result =
+                    query_to_json(&transaction, query, &mut size, raw_output, array_mode).await;
                 match result {
                     Ok(r) => results.push(r),
                     Err(e) => {
@@ -264,26 +313,27 @@ pub async fn handle(
                 }
             }
             transaction.commit().await?;
-            let mut headers = HashMap::default();
             if txn_read_only {
-                headers.insert(
+                response = response.header(
                     TXN_READ_ONLY.clone(),
                     HeaderValue::try_from(txn_read_only.to_string())?,
                 );
             }
             if txn_deferrable {
-                headers.insert(
+                response = response.header(
                     TXN_DEFERRABLE.clone(),
                     HeaderValue::try_from(txn_deferrable.to_string())?,
                 );
             }
             if let Some(txn_isolation_level) = txn_isolation_level_raw {
-                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
             }
-            Ok((json!({ "results": results }), headers))
+            Ok(json!({ "results": results }))
         }
     };
 
+    let metrics = client.metrics();
+
     if allow_pool {
         let current_span = tracing::Span::current();
         // return connection to the pool
@@ -293,12 +343,30 @@ pub async fn handle(
         });
     }
 
-    result
+    match result {
+        Ok(value) => {
+            // how could this possibly fail
+            let body = serde_json::to_string(&value).expect("json serialization should not fail");
+            let len = body.len();
+            let response = response
+                .body(Body::from(body))
+                // only fails if invalid status code or invalid header/values are given.
+                // these are not user configurable so it cannot fail dynamically
+                .expect("building response payload should not fail");
+
+            // count the egress bytes - we miss the TLS and header overhead but oh well...
+            // moving this later in the stack is going to be a lot of effort and ehhhh
+            metrics.record_egress(len as u64);
+            Ok(response)
+        }
+        Err(e) => Err(e),
+    }
 }
 
 async fn query_to_json<T: GenericClient>(
     client: &T,
     data: QueryData,
+    current_size: &mut usize,
     raw_output: bool,
     array_mode: bool,
 ) -> anyhow::Result<Value> {
@@ -312,16 +380,10 @@ async fn query_to_json<T: GenericClient>(
     // big.
     pin_mut!(row_stream);
     let mut rows: Vec<tokio_postgres::Row> = Vec::new();
-    let mut current_size = 0;
     while let Some(row) = row_stream.next().await {
         let row = row?;
-        current_size += row.body_len();
+        *current_size += row.body_len();
         rows.push(row);
-        if current_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!(
-                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
-            ));
-        }
     }
 
     // grab the command tag and number of rows affected
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
index fa66df0469..994a7de764 100644
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -7,7 +7,6 @@ use crate::{
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
-use hashbrown::HashMap;
 use hyper::{
     server::{
         accept,
@@ -18,7 +17,6 @@ use hyper::{
 };
 use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
 use pin_project_lite::pin_project;
-use serde_json::{json, Value};
 
 use std::{
     convert::Infallible,
@@ -204,44 +202,7 @@ async fn ws_handler(
     // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
     // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
-            .instrument(info_span!("sql-over-http"))
-            .await;
-        let status_code = match result {
-            Ok(_) => StatusCode::OK,
-            Err(_) => StatusCode::BAD_REQUEST,
-        };
-        let (json, headers) = match result {
-            Ok(r) => r,
-            Err(e) => {
-                let message = format!("{:?}", e);
-                let code = match e.downcast_ref::<tokio_postgres::Error>() {
-                    Some(e) => match e.code() {
-                        Some(e) => serde_json::to_value(e.code()).unwrap(),
-                        None => Value::Null,
-                    },
-                    None => Value::Null,
-                };
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
-                (
-                    json!({ "message": message, "code": code }),
-                    HashMap::default(),
-                )
-            }
-        };
-        json_response(status_code, json).map(|mut r| {
-            r.headers_mut().insert(
-                "Access-Control-Allow-Origin",
-                hyper::http::HeaderValue::from_static("*"),
-            );
-            for (k, v) in headers {
-                r.headers_mut().insert(k, v);
-            }
-            r
-        })
+        sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
         Response::builder()
             .header("Allow", "OPTIONS, POST")
@@ -253,7 +214,7 @@ async fn ws_handler(
             .header("Access-Control-Max-Age", "86400" /* 24 hours */)
             .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
             .body(Body::empty())
-            .map_err(|e| ApiError::BadRequest(e.into()))
+            .map_err(|e| ApiError::InternalServerError(e.into()))
     } else {
         json_response(StatusCode::BAD_REQUEST, "query is not supported")
     }
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 9279002eb3..cfeec5622b 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -3,9 +3,18 @@
 use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
-use serde::Serialize;
-use std::{collections::HashMap, convert::Infallible, time::Duration};
-use tracing::{error, info, instrument, trace, warn};
+use dashmap::{mapref::entry::Entry, DashMap};
+use once_cell::sync::Lazy;
+use serde::{Deserialize, Serialize};
+use std::{
+    convert::Infallible,
+    sync::{
+        atomic::{AtomicU64, AtomicUsize, Ordering},
+        Arc,
+    },
+    time::Duration,
+};
+use tracing::{error, info, instrument, trace};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
@@ -18,12 +27,95 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// Both the proxy and the ingestion endpoint will live in the same region (or cell)
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
-#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
+#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
     pub endpoint_id: String,
     pub branch_id: String,
 }
 
+#[derive(Debug)]
+pub struct MetricCounter {
+    transmitted: AtomicU64,
+    opened_connections: AtomicUsize,
+}
+
+impl MetricCounter {
+    /// Record that some bytes were sent from the proxy to the client
+    pub fn record_egress(&self, bytes: u64) {
+        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
+    }
+
+    /// extract the value that should be reported
+    fn should_report(self: &Arc<Self>) -> Option<u64> {
+        // heuristic to see if the branch is still open
+        // if a clone happens while we are observing, the heuristic will be incorrect.
+        //
+        // Worst case is that we won't report an event for this endpoint.
+        // However, for the strong count to be 1 it must have occurred that at one instant
+        // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
+        let is_open = Arc::strong_count(self) > 1;
+        let opened = self.opened_connections.swap(0, Ordering::AcqRel);
+
+        // update cached metrics eagerly, even if they can't get sent
+        // (to avoid sending the same metrics twice)
+        // see the relevant discussion on why to do so even if the status is not success:
+        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
+        let value = self.transmitted.swap(0, Ordering::AcqRel);
+
+        // Our only requirement is that we report in every interval if there was an open connection
+        // if there were no opened connections since, then we don't need to report
+        if value == 0 && !is_open && opened == 0 {
+            None
+        } else {
+            Some(value)
+        }
+    }
+
+    /// Determine whether the counter should be cleared from the global map.
+    fn should_clear(self: &mut Arc<Self>) -> bool {
+        // we can't clear this entry if it's acquired elsewhere
+        let Some(counter) = Arc::get_mut(self) else {
+            return false;
+        };
+        let opened = *counter.opened_connections.get_mut();
+        let value = *counter.transmitted.get_mut();
+        // clear if there's no data to report
+        value == 0 && opened == 0
+    }
+}
+
+// endpoint and branch IDs are not user generated so we don't run the risk of hash-dos
+type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
+
+#[derive(Default)]
+pub struct Metrics {
+    endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
+}
+
+impl Metrics {
+    /// Register a new byte metrics counter for this endpoint
+    pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
+        let entry = if let Some(entry) = self.endpoints.get(&ids) {
+            entry.clone()
+        } else {
+            self.endpoints
+                .entry(ids)
+                .or_insert_with(|| {
+                    Arc::new(MetricCounter {
+                        transmitted: AtomicU64::new(0),
+                        opened_connections: AtomicUsize::new(0),
+                    })
+                })
+                .clone()
+        };
+
+        entry.opened_connections.fetch_add(1, Ordering::AcqRel);
+        entry
+    }
+}
+
+pub static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
+
 pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
     info!("metrics collector config: {config:?}");
     scopeguard::defer! {
@@ -31,145 +123,83 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
     }
 
     let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
-    let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
     let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
+    let mut prev = Utc::now();
     let mut ticker = tokio::time::interval(config.interval);
     loop {
         ticker.tick().await;
 
-        let res = collect_metrics_iteration(
+        let now = Utc::now();
+        collect_metrics_iteration(
+            &USAGE_METRICS,
             &http_client,
-            &mut cached_metrics,
             &config.endpoint,
             &hostname,
+            prev,
+            now,
         )
         .await;
-
-        match res {
-            Err(e) => error!("failed to send consumption metrics: {e} "),
-            Ok(_) => trace!("periodic metrics collection completed successfully"),
-        }
+        prev = now;
     }
 }
 
-fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
-    let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
-    let metrics = prometheus::default_registry().gather();
-
-    for m in metrics {
-        if m.get_name() == "proxy_io_bytes_per_client" {
-            for ms in m.get_metric() {
-                let direction = ms
-                    .get_label()
-                    .iter()
-                    .find(|l| l.get_name() == "direction")
-                    .unwrap()
-                    .get_value();
-
-                // Only collect metric for outbound traffic
-                if direction == "tx" {
-                    let endpoint_id = ms
-                        .get_label()
-                        .iter()
-                        .find(|l| l.get_name() == "endpoint_id")
-                        .unwrap()
-                        .get_value();
-                    let branch_id = ms
-                        .get_label()
-                        .iter()
-                        .find(|l| l.get_name() == "branch_id")
-                        .unwrap()
-                        .get_value();
-
-                    let value = ms.get_counter().get_value() as u64;
-
-                    // Report if the metric value is suspiciously large
-                    if value > (1u64 << 40) {
-                        warn!(
-                            "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
-                            branch_id, endpoint_id, value
-                        );
-                    }
-
-                    current_metrics.push((
-                        Ids {
-                            endpoint_id: endpoint_id.to_string(),
-                            branch_id: branch_id.to_string(),
-                        },
-                        (value, Utc::now()),
-                    ));
-                }
-            }
-        }
-    }
-
-    current_metrics
-}
-
 #[instrument(skip_all)]
 async fn collect_metrics_iteration(
+    metrics: &Metrics,
     client: &http::ClientWithMiddleware,
-    cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
     metric_collection_endpoint: &reqwest::Url,
     hostname: &str,
-) -> anyhow::Result<()> {
+    prev: DateTime<Utc>,
+    now: DateTime<Utc>,
+) {
     info!(
         "starting collect_metrics_iteration. metric_collection_endpoint: {}",
         metric_collection_endpoint
     );
 
-    let current_metrics = gather_proxy_io_bytes_per_client();
+    let mut metrics_to_clear = Vec::new();
 
-    let metrics_to_send: Vec<Event<Ids, &'static str>> = current_metrics
+    let metrics_to_send: Vec<(Ids, u64)> = metrics
+        .endpoints
         .iter()
-        .filter_map(|(curr_key, (curr_val, curr_time))| {
-            let mut start_time = *curr_time;
-            let mut value = *curr_val;
-
-            if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
-                // Only send metrics updates if the metric has increased
-                if curr_val > prev_val {
-                    value = curr_val - prev_val;
-                    start_time = *prev_time;
-                } else {
-                    if curr_val < prev_val {
-                        error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
-                        prev_val, curr_val, curr_key);
-                    }
-                    return None;
-                }
+        .filter_map(|counter| {
+            let key = counter.key().clone();
+            let Some(value) = counter.should_report() else {
+                metrics_to_clear.push(key);
+                return None;
             };
-
-            Some(Event {
-                kind: EventType::Incremental {
-                    start_time,
-                    stop_time: *curr_time,
-                },
-                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname),
-                value,
-                extra: Ids {
-                    endpoint_id: curr_key.endpoint_id.clone(),
-                    branch_id: curr_key.branch_id.clone(),
-                },
-            })
+            Some((key, value))
         })
         .collect();
 
     if metrics_to_send.is_empty() {
         trace!("no new metrics to send");
-        return Ok(());
     }
 
     // Send metrics.
     // Split into chunks of 1000 metrics to avoid exceeding the max request size
     for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
+        let events = chunk
+            .iter()
+            .map(|(ids, value)| Event {
+                kind: EventType::Incremental {
+                    start_time: prev,
+                    stop_time: now,
+                },
+                metric: PROXY_IO_BYTES_PER_CLIENT,
+                idempotency_key: idempotency_key(hostname),
+                value: *value,
+                extra: Ids {
+                    endpoint_id: ids.endpoint_id.clone(),
+                    branch_id: ids.branch_id.clone(),
+                },
+            })
+            .collect();
+
         let res = client
             .post(metric_collection_endpoint.clone())
-            .json(&EventChunk {
-                events: chunk.into(),
-            })
+            .json(&EventChunk { events })
             .send()
             .await;
 
@@ -183,34 +213,113 @@ async fn collect_metrics_iteration(
 
         if !res.status().is_success() {
             error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
+            for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
                 // Report if the metric value is suspiciously large
                 error!("potentially abnormal metric value: {:?}", metric);
             }
         }
-        // update cached metrics after they were sent
-        // (to avoid sending the same metrics twice)
-        // see the relevant discussion on why to do so even if the status is not success:
-        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
-        for send_metric in chunk {
-            let stop_time = match send_metric.kind {
-                EventType::Incremental { stop_time, .. } => stop_time,
-                _ => unreachable!(),
-            };
+    }
 
-            cached_metrics
-                .entry(Ids {
-                    endpoint_id: send_metric.extra.endpoint_id.clone(),
-                    branch_id: send_metric.extra.branch_id.clone(),
-                })
-                // update cached value (add delta) and time
-                .and_modify(|e| {
-                    e.0 = e.0.saturating_add(send_metric.value);
-                    e.1 = stop_time
-                })
-                // cache new metric
-                .or_insert((send_metric.value, stop_time));
+    for metric in metrics_to_clear {
+        match metrics.endpoints.entry(metric) {
+            Entry::Occupied(mut counter) => {
+                if counter.get_mut().should_clear() {
+                    counter.remove_entry();
+                }
+            }
+            Entry::Vacant(_) => {}
         }
     }
-    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        net::TcpListener,
+        sync::{Arc, Mutex},
+    };
+
+    use anyhow::Error;
+    use chrono::Utc;
+    use consumption_metrics::{Event, EventChunk};
+    use hyper::{
+        service::{make_service_fn, service_fn},
+        Body, Response,
+    };
+    use url::Url;
+
+    use super::{collect_metrics_iteration, Ids, Metrics};
+    use crate::http;
+
+    #[tokio::test]
+    async fn metrics() {
+        let listener = TcpListener::bind("0.0.0.0:0").unwrap();
+
+        let reports = Arc::new(Mutex::new(vec![]));
+        let reports2 = reports.clone();
+
+        let server = hyper::server::Server::from_tcp(listener)
+            .unwrap()
+            .serve(make_service_fn(move |_| {
+                let reports = reports.clone();
+                async move {
+                    Ok::<_, Error>(service_fn(move |req| {
+                        let reports = reports.clone();
+                        async move {
+                            let bytes = hyper::body::to_bytes(req.into_body()).await?;
+                            let events: EventChunk<'static, Event<Ids, String>> =
+                                serde_json::from_slice(&bytes)?;
+                            reports.lock().unwrap().push(events);
+                            Ok::<_, Error>(Response::new(Body::from(vec![])))
+                        }
+                    }))
+                }
+            }));
+        let addr = server.local_addr();
+        tokio::spawn(server);
+
+        let metrics = Metrics::default();
+        let client = http::new_client();
+        let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
+        let now = Utc::now();
+
+        // no counters have been registered
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert!(r.is_empty());
+
+        // register a new counter
+        let counter = metrics.register(Ids {
+            endpoint_id: "e1".to_string(),
+            branch_id: "b1".to_string(),
+        });
+
+        // the counter should be observed despite 0 egress
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert_eq!(r.len(), 1);
+        assert_eq!(r[0].events.len(), 1);
+        assert_eq!(r[0].events[0].value, 0);
+
+        // record egress
+        counter.record_egress(1);
+
+        // egress should be observed
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert_eq!(r.len(), 1);
+        assert_eq!(r[0].events.len(), 1);
+        assert_eq!(r[0].events[0].value, 1);
+
+        // release counter
+        drop(counter);
+
+        // we do not observe the counter
+        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        assert!(r.is_empty());
+
+        // counter is unregistered
+        assert!(metrics.endpoints.is_empty());
+    }
 }
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index f9da145859..c8f534b2b7 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -7,6 +7,7 @@ use crate::{
     compute::{self, PostgresConnection},
     config::{ProxyConfig, TlsConfig},
     console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
+    metrics::{Ids, USAGE_METRICS},
     protocol2::WithClientIp,
     stream::{PqStream, Stream},
 };
@@ -602,6 +603,11 @@ pub async fn proxy_pass(
     compute: impl AsyncRead + AsyncWrite + Unpin,
     aux: &MetricsAuxInfo,
 ) -> anyhow::Result<()> {
+    let usage = USAGE_METRICS.register(Ids {
+        endpoint_id: aux.endpoint_id.to_string(),
+        branch_id: aux.branch_id.to_string(),
+    });
+
     let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
     let mut client = MeasuredStream::new(
         client,
@@ -609,6 +615,7 @@ pub async fn proxy_pass(
         |cnt| {
             // Number of bytes we sent to the client (outbound).
             m_sent.inc_by(cnt as u64);
+            usage.record_egress(cnt as u64);
         },
     );
 

From b497d0094e3b6cd63d6a75db7823b91d4a8650e0 Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Thu, 28 Sep 2023 06:47:44 -0700
Subject: [PATCH 13/24] file cache: Remove free space monitor (#5406)

This effectively reverts #3832.

There's a couple issues we just discovered with the free space monitor,
and to my knowledge, the fact we're putting the file cache on a separate
filesystem (even when on disk) that's guaranteed to have more room than
the maximum size means that this free space monitor should have no
effect.

More details:

1. The control plane sets the maximum file cache size based on max CU
2. The control plane sets the size of the filesystem underlying the file
cache based on the maximum user selectable CU (or, if the endpoint is
larger, then that size), so that there's always enough room
3. If postmaster gets SIGKILL'd, then the free space monitor process
does not exit
4. If the free space monitor is acting on the cache file but not subject
to locking or up-to-date metadata from a newer postgres instance, then
this could lead to data corruption.

So, in practice I believe the risk of data corruption is *low* but not
nothing, and given the issues we hit because of (3), and given that
the free space monitor shouldn't be necessary because of (1) and (2),
it's best to just remove it outright.

See also: neondatabase/autoscaling#534, #5405
---
 pgxn/neon/file_cache.c | 99 ------------------------------------------
 1 file changed, 99 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 4be75e1dad..22f20a4c0b 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -14,7 +14,6 @@
  */
 
 #include <sys/file.h>
-#include <sys/statvfs.h>
 #include <unistd.h>
 #include <fcntl.h>
 
@@ -38,9 +37,6 @@
 #include "storage/fd.h"
 #include "storage/pg_shmem.h"
 #include "storage/buf_internals.h"
-#include "storage/procsignal.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"
 
 /*
  * Local file cache is used to temporary store relations pages in local file system.
@@ -66,9 +62,6 @@
 
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
 
-#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
-#define MAX_DISK_WRITE_RATE       1000 /* MB/sec */
-
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
@@ -91,14 +84,12 @@ static int   lfc_desc = 0;
 static LWLockId lfc_lock;
 static int   lfc_max_size;
 static int   lfc_size_limit;
-static int   lfc_free_space_watermark;
 static char* lfc_path;
 static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif
-static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
 
 void FileCacheMonitorMain(Datum main_arg);
 
@@ -254,80 +245,6 @@ lfc_change_limit_hook(int newval, void *extra)
 	LWLockRelease(lfc_lock);
 }
 
-/*
- * Local file system state monitor check available free space.
- * If it is lower than lfc_free_space_watermark then we shrink size of local cache
- * but throwing away least recently accessed chunks.
- * First time low space watermark is reached cache size is divided by two,
- * second time by four,... Finally we remove all chunks from local cache.
- *
- * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
- * We only throw away cached chunks but do not prevent from filling cache by new chunks.
- *
- * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
- * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
- * Calling statvfs each second should not add any noticeable overhead.
- */
-void
-FileCacheMonitorMain(Datum main_arg)
-{
-	/*
-	 * Choose file system state monitor interval so that space can not be exosted
-	 * during this period but not longer than  MAX_MONITOR_INTERVAL (10 sec)
-	 */
-	uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
-
-	/* Establish signal handlers. */
-	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
-	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
-	BackgroundWorkerUnblockSignals();
-
-	/* Periodically dump buffers until terminated. */
-	while (!ShutdownRequestPending)
-	{
-		if (lfc_size_limit != 0)
-		{
-			struct statvfs sfs;
-			if (statvfs(lfc_path, &sfs) < 0)
-			{
-				elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
-			}
-			else
-			{
-				if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
-				{
-					if (lfc_shrinking_factor < 31) {
-						lfc_shrinking_factor += 1;
-					}
-					lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
-				}
-				else
-					lfc_shrinking_factor = 0; /* reset to initial value */
-			}
-		}
-		pg_usleep(monitor_interval);
-	}
-}
-
-static void
-lfc_register_free_space_monitor(void)
-{
-	BackgroundWorker bgw;
-	memset(&bgw, 0, sizeof(bgw));
-	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
-	snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
-	snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
-	bgw.bgw_restart_time = 5;
-	bgw.bgw_notify_pid = 0;
-	bgw.bgw_main_arg = (Datum) 0;
-
-	RegisterBackgroundWorker(&bgw);
-}
-
 void
 lfc_init(void)
 {
@@ -364,19 +281,6 @@ lfc_init(void)
 							lfc_change_limit_hook,
 							NULL);
 
-	DefineCustomIntVariable("neon.free_space_watermark",
-							"Minimal free space in local file system after reaching which local file cache will be truncated",
-							NULL,
-							&lfc_free_space_watermark,
-							1024, /* 1GB */
-							0,
-							INT_MAX,
-							PGC_SIGHUP,
-							GUC_UNIT_MB,
-							NULL,
-							NULL,
-							NULL);
-
 	DefineCustomStringVariable("neon.file_cache_path",
 							   "Path to local file cache (can be raw device)",
 							   NULL,
@@ -391,9 +295,6 @@ lfc_init(void)
 	if (lfc_max_size == 0)
 		return;
 
-	if (lfc_free_space_watermark != 0)
-		lfc_register_free_space_monitor();
-
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = lfc_shmem_startup;
 #if PG_VERSION_NUM>=150000

From ca3ca2bb9c92bf0e0b6f7d056f9545958a180ae1 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 28 Sep 2023 17:20:34 +0100
Subject: [PATCH 14/24] pageserver: don't try and recover deletion queue if no
 remote storage (#5419)

## Problem

Because `neon_local` by default runs with no remote storage, it was not
running the deletion queue workers, and the attempt to call into
`recover()` was failing.

This is a bogus configuration that will go away when we make remote
storage mandatory.

## Summary of changes

Don't try and do deletion queue recovery when remote storage is
disabled.

The reason we don't just unset `control_plane_api` to avoid this is that
generations will soon become mandatory, irrespective of when we make
remote storage mandatory.
---
 pageserver/src/tenant/mgr.rs | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 6f3863dd4b..17bcc9eb5f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -130,10 +130,15 @@ pub async fn init_tenant_mgr(
         // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
         // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
         // are processed, even though we don't block on recovery completing here.
-        resources
-            .deletion_queue_client
-            .recover(result.clone())
-            .await?;
+        //
+        // Must only do this if remote storage is enabled, otherwise deletion queue
+        // is not running and channel push will fail.
+        if resources.remote_storage.is_some() {
+            resources
+                .deletion_queue_client
+                .recover(result.clone())
+                .await?;
+        }
 
         Some(result)
     } else {

From 1881373ec48fb4002eab76bc5f1a5d2d47c729cf Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 28 Sep 2023 17:34:51 +0100
Subject: [PATCH 15/24] Update CODEOWNERS (#5421)

It is usually not intended to notify a random member of the compute team
for pageserver PRs.

Leaving the notification of the storage team in place, because this
serves a purpose when some external contributor opens a PR and isn't
sure who to ask.
---
 CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index b8ca54bc7e..e384dc39f1 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -5,7 +5,7 @@
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers

From 6a1903987ac9e700ab24f8dd22e2ea22b30c87e7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Sep 2023 09:15:43 +0100
Subject: [PATCH 16/24] tests: use approximate equality in
 test_get_tenant_size_with_multiple_branches (#5411)

## Problem

This test has been flaky for a long time.

As far as I can tell, the test was simply wrong to expect postgres
activity to result in deterministic sizes: making the match fuzzy is not
a hack, it's just matching the reality that postgres doesn't promise to
write exactly the same number of pages every time it runs a given query.

## Summary of changes

Equality checks now tolerate a difference of up to 4 pages. This is big
enough to tolerate the deltas we've seen in practice.

Closes: https://github.com/neondatabase/neon/issues/2962
---
 test_runner/regress/test_tenant_size.py | 27 ++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 49a6ca5a53..7cea301a9c 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_until_tenant_active,
 )
-from fixtures.pg_version import PgVersion, xfail_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 
 
@@ -532,7 +532,24 @@ def test_single_branch_get_tenant_size_grows(
     assert size_after == prev, "size after restarting pageserver should not have changed"
 
 
-@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
+def assert_size_approx_equal(size_a, size_b):
+    """
+    Tests that evaluate sizes are checking the pageserver space consumption
+    that sits many layers below the user input.  The exact space needed
+    varies slightly depending on postgres behavior.
+
+    Rather than expecting postgres to be determinstic and occasionally
+    failing the test, we permit sizes for the same data to vary by a few pages.
+    """
+
+    # Determined empirically from examples of equality failures: they differ
+    # by page multiples of 8272, and usually by 1-3 pages.  Tolerate 4 to avoid
+    # failing on outliers from that observed range.
+    threshold = 4 * 8272
+
+    assert size_a == pytest.approx(size_b, abs=threshold)
+
+
 def test_get_tenant_size_with_multiple_branches(
     neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
@@ -573,7 +590,7 @@ def test_get_tenant_size_with_multiple_branches(
     )
 
     size_after_first_branch = http_client.tenant_size(tenant_id)
-    assert size_after_first_branch == size_at_branch
+    assert_size_approx_equal(size_after_first_branch, size_at_branch)
 
     first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)
 
@@ -599,7 +616,7 @@ def test_get_tenant_size_with_multiple_branches(
         "second-branch", main_branch_name, tenant_id
     )
     size_after_second_branch = http_client.tenant_size(tenant_id)
-    assert size_after_second_branch == size_after_continuing_on_main
+    assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
 
     second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)
 
@@ -635,7 +652,7 @@ def test_get_tenant_size_with_multiple_branches(
     # tenant_size but so far this has been reliable, even though at least gc
     # and tenant_size race for the same locks
     size_after = http_client.tenant_size(tenant_id)
-    assert size_after == size_after_thinning_branch
+    assert_size_approx_equal(size_after, size_after_thinning_branch)
 
     size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
     size_debug = http_client.tenant_size_debug(tenant_id)

From fd20bbc6cbed7158e8756fd850d2750bd99b6647 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 29 Sep 2023 09:40:27 +0100
Subject: [PATCH 17/24] proxy: log params when no endpoint (#5418)

## Problem

Our SNI error dashboard features IP addresses, but it is still not
immediately clear who those addresses belong to (#5369).

## Summary of changes

Log some startup params with this error
---
 proxy/src/auth/backend.rs | 13 +++++++++++++
 proxy/src/proxy.rs        |  9 ++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index ff73f2b625..03c9029862 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -160,6 +160,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
             Test(_) => Some("test".to_owned()),
         }
     }
+
+    /// Get username from the credentials.
+    pub fn get_user(&self) -> &str {
+        use BackendType::*;
+
+        match self {
+            Console(_, creds) => creds.user,
+            Postgres(_, creds) => creds.user,
+            Link(_) => "link",
+            Test(_) => "test",
+        }
+    }
+
     /// Authenticate the client via the requested backend, possibly using credentials.
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub async fn authenticate(
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index c8f534b2b7..71e00ed58f 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -697,7 +697,14 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             .await
         {
             Ok(auth_result) => auth_result,
-            Err(e) => return stream.throw_error(e).await,
+            Err(e) => {
+                let user = creds.get_user();
+                let db = params.get("database");
+                let app = params.get("application_name");
+                let params_span = tracing::info_span!("", ?user, ?db, ?app);
+
+                return stream.throw_error(e).instrument(params_span).await;
+            }
         };
 
         let AuthSuccess {

From 86dd28d4fbb48de11df9e56ff8e77121e4fceca1 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 29 Sep 2023 12:57:45 +0100
Subject: [PATCH 18/24] Bump hermit-abi & num_cpus packages (#5427)

## Problem

I've noticed that `hermit-abi`
0.3.1 [1] has been yanked from crates.io (looks like nothing too
bad [2]).
Also, we have 2 versions of `hermit-abi` in dependencies (0.3.* and
0.2.*); update `num-cpus` to use the latest `hermit-abi` 0.3.3.

- [1] https://crates.io/crates/hermit-abi/0.3.1
- [2] https://github.com/hermit-os/hermit-rs/issues/436

## Summary of changes
- `cargo update -p num-cpus`
- `cargo update -p hermit-abi`
- Unignore `RUSTSEC-2023-0052` in `deny.toml` (it has been fixed in
https://github.com/neondatabase/neon/pull/5069)
---
 Cargo.lock | 23 +++++++----------------
 deny.toml  |  2 +-
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b22f081bdc..36e7069eb1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1780,18 +1780,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
 [[package]]
 name = "hermit-abi"
-version = "0.2.6"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "hermit-abi"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
 
 [[package]]
 name = "hex"
@@ -2053,7 +2044,7 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
 dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
  "libc",
  "windows-sys 0.48.0",
 ]
@@ -2070,7 +2061,7 @@ version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
  "io-lifetimes",
  "rustix 0.37.19",
  "windows-sys 0.48.0",
@@ -2444,11 +2435,11 @@ dependencies = [
 
 [[package]]
 name = "num_cpus"
-version = "1.15.0"
+version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
+checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
 dependencies = [
- "hermit-abi 0.2.6",
+ "hermit-abi",
  "libc",
 ]
 
diff --git a/deny.toml b/deny.toml
index 55c581ce3a..f4ea0d4dac 100644
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = ["RUSTSEC-2023-0052"]
+ignore = []
 
 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:

From c07eef8ea5fd2aceb93e3aba1150e979c3810ea8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 29 Sep 2023 20:03:56 +0200
Subject: [PATCH 19/24] page_cache: find_victim: don't spin while there's no
 chance for a slot (#5319)

It is wasteful to cycle through the page cache slots trying to find a
victim slot if all the slots are currently un-evictable because a read /
write guard is alive.

We suspect this wasteful cycling to be the root cause for an
"indigestion" we observed in staging (#5291).
The hypothesis is that we `.await` after we get ahold of a read / write
guard, and that tokio actually deschedules us in favor of another
future.
If that other future then needs a page slot, it can't get ours because
we're holding the guard.
Repeat this, and eventually, the other future(s) will find themselves
doing `find_victim` until they hit `exceeded evict iter limit`.

The `find_victim` is wasteful and CPU-starves the futures that are
already holding the read/write guard. A `yield` inside `find_victim`
could mitigate the starvation, but wouldn't fix the wasting of CPU
cycles.

So instead, this PR queues waiters behind a tokio semaphore that counts
evictable slots.
The downside is that this stops the clock page replacement if we have 0
evictable slots.

Also, as explained by the big block comment in `find_victim`, the
semaphore doesn't fully prevent starvation because we can't make tokio
prioritize those tasks executing `find_victim` that have been trying
the longest.

Implementation
===============
We need to acquire the semaphore permit before locking the slot.
Otherwise, we could deadlock / discover that all permits are gone and
would have to relinquish the slot, having moved forward the Clock LRU
without making progress.

The downside is that we never get full throughput for read-heavy
workloads because, until the reader coalesces onto an existing permit,
it'll hold its own permit.


Addendum To Root-Cause Analysis In #5291
========================================

Since merging that PR, @arpad-m pointed out that we couldn't have
reached the `slot.write().await` with his patches because the
VirtualFile slots can't have all been write-locked, because we only hold
them locked while the IO is ongoing, and the IO is still done with
synchronous system calls in that patch set, so, we can have had at most
$number_of_executor_threads locked at any given time.
I count 3 tokio runtimes that do `Timeline::get`, each with 8 executor
threads in our deployment => $number_of_executor_threads = 3*8 = 24 .
But the virtual file cache has 100 slots.

We both agree that nothing changed about the core hypothesis, i.e.,
additional await points inside VirtualFile caused higher concurrency
resulting in exhaustion of page cache slots.
But we'll need to reproduce the issue and investigate further to truly
understand the root cause, or find out that & why we were indeed using
100 VirtualFile slots.

TODO: could it be compaction that needs to hold guards of many
VirtualFile's in its iterators?
---
 pageserver/src/metrics.rs    |  40 ++++++++
 pageserver/src/page_cache.rs | 194 ++++++++++++++++++++++++++++++-----
 2 files changed, 211 insertions(+), 23 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index de94eb8152..f85f525630 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -264,6 +264,46 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
     },
 });
 
+pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_cache_acquire_pinned_slot_seconds",
+        "Time spent acquiring a pinned slot in the page cache",
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_page_cache_find_victim_iters_total",
+        "Counter for the number of iterations in the find_victim loop",
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "page_cache_errors_total",
+        "Number of timeouts while acquiring a pinned slot in the page cache",
+        &["error_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum PageCacheErrorKind {
+    AcquirePinnedSlotTimeout,
+    EvictIterLimit,
+}
+
+pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
+    PAGE_CACHE_ERRORS
+        .get_metric_with_label_values(&[error_kind.into()])
+        .unwrap()
+        .inc();
+}
+
 pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_wait_lsn_seconds",
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index 38b169ea85..97ca2bfea7 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,7 +75,11 @@
 use std::{
     collections::{hash_map::Entry, HashMap},
     convert::TryInto,
-    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+    sync::{
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+        Arc, Weak,
+    },
+    time::Duration,
 };
 
 use anyhow::Context;
@@ -165,6 +169,8 @@ struct Slot {
 
 struct SlotInner {
     key: Option<CacheKey>,
+    // for `coalesce_readers_permit`
+    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
     buf: &'static mut [u8; PAGE_SZ],
 }
 
@@ -207,6 +213,22 @@ impl Slot {
     }
 }
 
+impl SlotInner {
+    /// If there is aready a reader, drop our permit and share its permit, just like we share read access.
+    fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
+        let mut guard = self.permit.lock().unwrap();
+        if let Some(existing_permit) = guard.upgrade() {
+            drop(guard);
+            drop(permit);
+            existing_permit
+        } else {
+            let permit = Arc::new(permit);
+            *guard = Arc::downgrade(&permit);
+            permit
+        }
+    }
+}
+
 pub struct PageCache {
     /// This contains the mapping from the cache key to buffer slot that currently
     /// contains the page, if any.
@@ -224,6 +246,8 @@ pub struct PageCache {
     /// The actual buffers with their metadata.
     slots: Box<[Slot]>,
 
+    pinned_slots: Arc<tokio::sync::Semaphore>,
+
     /// Index of the next candidate to evict, for the Clock replacement algorithm.
     /// This is interpreted modulo the page cache size.
     next_evict_slot: AtomicUsize,
@@ -231,23 +255,28 @@ pub struct PageCache {
     size_metrics: &'static PageCacheSizeMetrics,
 }
 
+struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i> {
+    _permit: Arc<PinnedSlotsPermit>,
+    slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
+}
 
 impl std::ops::Deref for PageReadGuard<'_> {
     type Target = [u8; PAGE_SZ];
 
     fn deref(&self) -> &Self::Target {
-        self.0.buf
+        self.slot_guard.buf
     }
 }
 
 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
     fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.0.buf
+        self.slot_guard.buf
     }
 }
 
@@ -264,6 +293,8 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 pub struct PageWriteGuard<'i> {
     inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
 
+    _permit: PinnedSlotsPermit,
+
     // Are the page contents currently valid?
     // Used to mark pages as invalid that are assigned but not yet filled with data.
     valid: bool,
@@ -348,6 +379,10 @@ impl PageCache {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Option<(Lsn, PageReadGuard)> {
+        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+            return None;
+        };
+
         crate::metrics::PAGE_CACHE
             .for_ctx(ctx)
             .read_accesses_materialized_page
@@ -362,7 +397,10 @@ impl PageCache {
             lsn,
         };
 
-        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
+        if let Some(guard) = self
+            .try_lock_for_read(&mut cache_key, &mut Some(permit))
+            .await
+        {
             if let CacheKey::MaterializedPage {
                 hash_key: _,
                 lsn: available_lsn,
@@ -445,6 +483,29 @@ impl PageCache {
     // "mappings" after this section. But the routines in this section should
     // not require changes.
 
+    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
+        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
+        match tokio::time::timeout(
+            // Choose small timeout, neon_smgr does its own retries.
+            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
+            Duration::from_secs(10),
+            Arc::clone(&self.pinned_slots).acquire_owned(),
+        )
+        .await
+        {
+            Ok(res) => Ok(PinnedSlotsPermit(
+                res.expect("this semaphore is never closed"),
+            )),
+            Err(_timeout) => {
+                timer.stop_and_discard();
+                crate::metrics::page_cache_errors_inc(
+                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
+                );
+                anyhow::bail!("timeout: there were page guards alive for all page cache slots")
+            }
+        }
+    }
+
     /// Look up a page in the cache.
     ///
     /// If the search criteria is not exact, *cache_key is updated with the key
@@ -454,7 +515,11 @@ impl PageCache {
     ///
     /// If no page is found, returns None and *cache_key is left unmodified.
     ///
-    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+    async fn try_lock_for_read(
+        &self,
+        cache_key: &mut CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageReadGuard> {
         let cache_key_orig = cache_key.clone();
         if let Some(slot_idx) = self.search_mapping(cache_key) {
             // The page was found in the mapping. Lock the slot, and re-check
@@ -464,7 +529,10 @@ impl PageCache {
             let inner = slot.inner.read().await;
             if inner.key.as_ref() == Some(cache_key) {
                 slot.inc_usage_count();
-                return Some(PageReadGuard(inner));
+                return Some(PageReadGuard {
+                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
+                    slot_guard: inner,
+                });
             } else {
                 // search_mapping might have modified the search key; restore it.
                 *cache_key = cache_key_orig;
@@ -507,6 +575,8 @@ impl PageCache {
         cache_key: &mut CacheKey,
         ctx: &RequestContext,
     ) -> anyhow::Result<ReadBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+
         let (read_access, hit) = match cache_key {
             CacheKey::MaterializedPage { .. } => {
                 unreachable!("Materialized pages use lookup_materialized_page")
@@ -523,17 +593,21 @@ impl PageCache {
         let mut is_first_iteration = true;
         loop {
             // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
+            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
                 if is_first_iteration {
                     hit.inc();
                 }
                 return Ok(ReadBufResult::Found(read_guard));
             }
+            debug_assert!(permit.is_some());
             is_first_iteration = false;
 
             // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
 
             // Insert mapping for this. At this point, we may find that another
             // thread did the same thing concurrently. In that case, we evicted
@@ -555,7 +629,16 @@ impl PageCache {
             inner.key = Some(cache_key.clone());
             slot.set_usage_count(1);
 
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
             return Ok(ReadBufResult::NotFound(PageWriteGuard {
+                _permit: permit.take().unwrap(),
                 inner,
                 valid: false,
             }));
@@ -566,7 +649,11 @@ impl PageCache {
     /// found, returns None.
     ///
     /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
+    async fn try_lock_for_write(
+        &self,
+        cache_key: &CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageWriteGuard> {
         if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
             // The page was found in the mapping. Lock the slot, and re-check
             // that it's still what we expected (because we don't released the mapping
@@ -575,7 +662,18 @@ impl PageCache {
             let inner = slot.inner.write().await;
             if inner.key.as_ref() == Some(cache_key) {
                 slot.inc_usage_count();
-                return Some(PageWriteGuard { inner, valid: true });
+                debug_assert!(
+                    {
+                        let guard = inner.permit.lock().unwrap();
+                        guard.upgrade().is_none()
+                    },
+                    "we hold a write lock, so, no one else should have a permit"
+                );
+                return Some(PageWriteGuard {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                    valid: true,
+                });
             }
         }
         None
@@ -586,15 +684,20 @@ impl PageCache {
     /// Similar to lock_for_read(), but the returned buffer is write-locked and
     /// may be modified by the caller even if it's already found in the cache.
     async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
         loop {
             // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
+            if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
                 return Ok(WriteBufResult::Found(write_guard));
             }
+            debug_assert!(permit.is_some());
 
             // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
 
             // Insert mapping for this. At this point, we may find that another
             // thread did the same thing concurrently. In that case, we evicted
@@ -616,7 +719,16 @@ impl PageCache {
             inner.key = Some(cache_key.clone());
             slot.set_usage_count(1);
 
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
             return Ok(WriteBufResult::NotFound(PageWriteGuard {
+                _permit: permit.take().unwrap(),
                 inner,
                 valid: false,
             }));
@@ -769,7 +881,10 @@ impl PageCache {
     /// Find a slot to evict.
     ///
     /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+    async fn find_victim(
+        &self,
+        _permit_witness: &PinnedSlotsPermit,
+    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
         let iter_limit = self.slots.len() * 10;
         let mut iters = 0;
         loop {
@@ -782,13 +897,40 @@ impl PageCache {
                 let mut inner = match slot.inner.try_write() {
                     Ok(inner) => inner,
                     Err(_err) => {
-                        // If we have looped through the whole buffer pool 10 times
-                        // and still haven't found a victim buffer, something's wrong.
-                        // Maybe all the buffers were in locked. That could happen in
-                        // theory, if you have more threads holding buffers locked than
-                        // there are buffers in the pool. In practice, with a reasonably
-                        // large buffer pool it really shouldn't happen.
                         if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
                             anyhow::bail!("exceeded evict iter limit");
                         }
                         continue;
@@ -799,6 +941,7 @@ impl PageCache {
                     self.remove_mapping(old_key);
                     inner.key = None;
                 }
+                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                 return Ok((slot_idx, inner));
             }
         }
@@ -826,7 +969,11 @@ impl PageCache {
                 let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
 
                 Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
                     usage_count: AtomicU8::new(0),
                 }
             })
@@ -838,6 +985,7 @@ impl PageCache {
             slots,
             next_evict_slot: AtomicUsize::new(0),
             size_metrics,
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
         }
     }
 }

From 89275f6c1e7b09504711e9da37fff080bebb5ea8 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Fri, 29 Sep 2023 10:39:28 -0800
Subject: [PATCH 20/24] Fix invalid database resulting from failed DROP DB
 (#5423)

## Problem
If the control plane happened to respond to a DROP DATABASE request with
a non-200 response, we'd abort the DROP DATABASE transaction in the
usual spot. However, Postgres for some reason actually performs the drop
inside of `standard_ProcessUtility`. As such, the database was left in a
weird state after aborting the transaction. We had test coverage of a
failed CREATE DATABASE but not a failed DROP DATABASE.

## Summary of changes
Since DROP DATABASE can't be inside of a transaction block, we can just
forward the DDL changes to the control plane inside of
`ProcessUtility_hook`, and if it responds with a 500, bail out of
`ProcessUtility` before we perform the drop. This change also adds a
test, which reproduced the invalid database issue before the fix was
applied.
---
 pgxn/neon/control_plane_connector.c        | 7 +++++++
 test_runner/regress/test_ddl_forwarding.py | 8 ++++++++
 2 files changed, 15 insertions(+)

diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index debbbce117..8b0035b8e8 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -741,6 +741,13 @@ NeonProcessUtility(
 			break;
 		case T_DropdbStmt:
 			HandleDropDb(castNode(DropdbStmt, parseTree));
+			/*
+			 * We do this here to hack around the fact that Postgres performs the drop
+			 * INSIDE of standard_ProcessUtility, which means that if we try to
+			 * abort the drop normally it'll be too late. DROP DATABASE can't be inside
+			 * of a transaction block anyway, so this should be fine to do.
+			 */
+			NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
 			break;
 		case T_CreateRoleStmt:
 			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 740e489759..d4cf1b4739 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -211,4 +211,12 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
         ddl.wait()
 
     ddl.failures(False)
+    cur.execute("CREATE DATABASE failure WITH OWNER=cork")
+    ddl.wait()
+    with pytest.raises(psycopg2.InternalError):
+        ddl.failures(True)
+        cur.execute("DROP DATABASE failure")
+        ddl.wait()
+    ddl.pg.connect(dbname="failure")  # Ensure we can connect after a failed drop
+
     conn.close()

From c1dcf61ca29ea40ebd5092936f21b38ae2d5bf3b Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Mon, 2 Oct 2023 16:52:45 +0200
Subject: [PATCH 21/24] Update pgx-ulid extension (#5382)

- Update `pgx-ulid` from 0.1.0 to 0.1.3, and add it to Postgres 16
- Add `pg_tiktoken` to Postgres 16 image

Closes #5374

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 Dockerfile.compute-node | 84 ++++++++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 38 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 55eb9b7411..10bf550b50 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -614,15 +614,11 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 
 #########################################################################################
 #
-# Layer "rust extensions"
+# Layer "rust extensions" for older extension which hasn't been updated to `pgrx` yet
 # This layer is used to build `pgx` deps
 #
-# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
-# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
-# dependency on all the rust extension that depend on it, too.
-#
 #########################################################################################
-FROM build-deps AS rust-extensions-build
+FROM build-deps AS rust-extensions-build-pgx
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN apt-get update && \
@@ -654,6 +650,34 @@ RUN case "${PG_VERSION}" in \
 
 USER root
 
+#########################################################################################
+#
+# Layer "rust extensions"
+# This layer is used to build `pgrx` deps
+#
+#########################################################################################
+FROM build-deps AS rust-extensions-build-pgrx
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN apt-get update && \
+    apt-get install -y curl libclang-dev cmake && \
+    useradd -ms /bin/bash nonroot -b /home
+
+ENV HOME=/home/nonroot
+ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
+USER nonroot
+WORKDIR /home/nonroot
+ARG PG_VERSION
+
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+    chmod +x rustup-init && \
+    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
+    rm rustup-init && \
+    cargo install --locked --version 0.10.2 cargo-pgrx && \
+    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+
+USER root
+
 #########################################################################################
 #
 # Layer "pg-jsonschema-pg-build"
@@ -661,7 +685,7 @@ USER root
 #
 #########################################################################################
 
-FROM rust-extensions-build AS pg-jsonschema-pg-build
+FROM rust-extensions-build-pgx AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
 # caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
@@ -690,7 +714,7 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 
-FROM rust-extensions-build AS pg-graphql-pg-build
+FROM rust-extensions-build-pgx AS pg-graphql-pg-build
 ARG PG_VERSION
 
 # b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
@@ -724,24 +748,14 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 
-FROM rust-extensions-build AS pg-tiktoken-pg-build
+FROM rust-extensions-build-pgrx AS pg-tiktoken-pg-build
 ARG PG_VERSION
 
-# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
-    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
+# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
     mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
-    cargo pgx install --release && \
+    cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
 
 #########################################################################################
@@ -751,24 +765,18 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 
-FROM rust-extensions-build AS pg-pgx-ulid-build
+FROM rust-extensions-build-pgrx AS pg-pgx-ulid-build
 ARG PG_VERSION
 
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
-    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
+    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
     mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
+    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    echo "********************************************************************************************************" && \
+    sed -i 's/pgrx       = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
 
 #########################################################################################

From 00369c8c2a50944a730963266a758faf3c8f52e0 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 2 Oct 2023 23:50:27 +0100
Subject: [PATCH 22/24] Update pg_jsonschema & pg_graphql extensions (#5438)

- Update `pg_jsonschema` to 0.2.0 with Postgres 16 support
- Update `pg_graphql` to 1.4.0 with Postgres 16 support
- Remove `pgx` (old name of `pgrx`) layer from Dockerfile
---
 Dockerfile.compute-node | 91 ++++++-----------------------------------
 1 file changed, 13 insertions(+), 78 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 10bf550b50..e53ec47688 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -612,51 +612,13 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
     sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
     comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
 
-#########################################################################################
-#
-# Layer "rust extensions" for older extension which hasn't been updated to `pgrx` yet
-# This layer is used to build `pgx` deps
-#
-#########################################################################################
-FROM build-deps AS rust-extensions-build-pgx
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-RUN apt-get update && \
-    apt-get install -y curl libclang-dev cmake && \
-    useradd -ms /bin/bash nonroot -b /home
-
-ENV HOME=/home/nonroot
-ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
-USER nonroot
-WORKDIR /home/nonroot
-ARG PG_VERSION
-
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
-        ;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
-    chmod +x rustup-init && \
-    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
-    rm rustup-init && \
-    cargo install --locked --version 0.7.3 cargo-pgx && \
-    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
-
-USER root
-
 #########################################################################################
 #
 # Layer "rust extensions"
 # This layer is used to build `pgrx` deps
 #
 #########################################################################################
-FROM build-deps AS rust-extensions-build-pgrx
+FROM build-deps AS rust-extensions-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN apt-get update && \
@@ -685,26 +647,14 @@ USER root
 #
 #########################################################################################
 
-FROM rust-extensions-build-pgx AS pg-jsonschema-pg-build
+FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
-# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
-# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "b1bd95009c8809bd6cda9a37777f8b7df425ff1a34976c1e7a4b31cf838ace66 pg_jsonschema.tar.gz" | sha256sum --check && \
     mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
 
 #########################################################################################
@@ -714,29 +664,14 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 
-FROM rust-extensions-build-pgx AS pg-graphql-pg-build
+FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION
 
-# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
-# Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
-# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
-# same 1.1 version we've used before.
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
-    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
+    echo "ea85d45f8af1d2382e2af847f88102f930782c00e6c612308e6f08f27309d5f7 pg_graphql.tar.gz" | sha256sum --check && \
     mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
     # it's needed to enable extension because it uses untrusted C language
     sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -748,7 +683,7 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 
-FROM rust-extensions-build-pgrx AS pg-tiktoken-pg-build
+FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION
 
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
@@ -765,7 +700,7 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
 #
 #########################################################################################
 
-FROM rust-extensions-build-pgrx AS pg-pgx-ulid-build
+FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION
 
 RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \

From de0e96d2be85812460ffeddabbd0e5bd6d82b912 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 3 Oct 2023 10:22:11 +0200
Subject: [PATCH 23/24] remote_storage: separate semaphores for read and write
 ops (#5440)

Before this PR, a compaction that queues a lot of uploads could grab all
the semaphore permits.

Any readers that need on-demand downloads would queue up, causing
getpage@lsn outliers.

Internal context:
https://neondb.slack.com/archives/C05NXJFNRPA/p1696264359425419?thread_ts=1696250393.840899&cid=C05NXJFNRPA
---
 libs/remote_storage/src/s3_bucket.rs | 46 +++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index acab953904..fc6d7fa61b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -47,10 +47,47 @@ pub struct S3Bucket {
     bucket_name: String,
     prefix_in_bucket: Option<String>,
     max_keys_per_list_response: Option<i32>,
+    concurrency_limiter: ConcurrencyLimiter,
+}
+
+struct ConcurrencyLimiter {
     // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
     // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
     // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Arc<Semaphore>,
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get => &self.read,
+            RequestKind::Put => &self.write,
+            RequestKind::List => &self.read,
+            RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            read: Arc::new(Semaphore::new(limit)),
+            write: Arc::new(Semaphore::new(limit)),
+        }
+    }
 }
 
 #[derive(Default)]
@@ -117,7 +154,7 @@ impl S3Bucket {
             bucket_name: aws_config.bucket_name.clone(),
             max_keys_per_list_response: aws_config.max_keys_per_list_response,
             prefix_in_bucket,
-            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
+            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
         })
     }
 
@@ -156,7 +193,7 @@ impl S3Bucket {
         let started_at = start_counting_cancelled_wait(kind);
         let permit = self
             .concurrency_limiter
-            .acquire()
+            .acquire(kind)
             .await
             .expect("semaphore is never closed");
 
@@ -172,8 +209,7 @@ impl S3Bucket {
         let started_at = start_counting_cancelled_wait(kind);
         let permit = self
             .concurrency_limiter
-            .clone()
-            .acquire_owned()
+            .acquire_owned(kind)
             .await
             .expect("semaphore is never closed");
 

From 7222777784eb84efc13093783d9aabbe1260b5ef Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 3 Oct 2023 18:42:39 +0100
Subject: [PATCH 24/24] Update checksums for pg_jsonschema & pg_graphql (#5455)

## Problem

Folks have re-tagged releases for `pg_jsonschema` and `pg_graphql` (to
increase timeouts on their CI). For us, these are no-op changes, but
unfortunately they will cause our builds to fail due to checksum
mismatches (this might not strike right away because of the build cache).
- https://github.com/supabase/pg_jsonschema/commit/8ba7c7be9d3f12c0cf30c7105db303ce2aaf12c2
- https://github.com/supabase/pg_graphql/commit/aa7509370a4d34a26d48126ac24bc937a009c115

## Summary of changes
- `pg_jsonschema` update checksum
- `pg_graphql` update checksum
---
 Dockerfile.compute-node | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index e53ec47688..7e34b66d68 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -651,7 +651,7 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
 RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "b1bd95009c8809bd6cda9a37777f8b7df425ff1a34976c1e7a4b31cf838ace66 pg_jsonschema.tar.gz" | sha256sum --check && \
+    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
     mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release && \
@@ -668,7 +668,7 @@ FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION
 
 RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
-    echo "ea85d45f8af1d2382e2af847f88102f930782c00e6c612308e6f08f27309d5f7 pg_graphql.tar.gz" | sha256sum --check && \
+    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
     mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release && \