From db68e822355a4ef8ac9e3363d90bb9a2bd0e6dad Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 17 Oct 2024 10:06:02 +0100
Subject: [PATCH 01/48] storage_scrubber: fixes to garbage commands (#9409)

## Problem

While running `find-garbage` and `purge-garbage`, I encountered two
things that needed updating:
- Console API may omit `user_id` since org accounts were added
- When we cut over to using GenericRemoteStorage, the object listings we
do during purge did not get proper retry handling, so they could easily
fail on routine S3 errors and make the whole process drop out.

...and one bug:
- We had an `.unwrap()` which assumed that after finding an object in a
tenant path, a listing in that path would always return objects. This is
not true, because a pageserver might be deleting the path at the same
time as we scan it.

## Summary of changes

- When listing objects during purge, use backoff::retry
- Make `user_id` an `Option`
- Handle the case where a tenant's objects go away during find-garbage.
---
 storage_scrubber/src/cloud_admin_api.rs |  2 +-
 storage_scrubber/src/garbage.rs         | 65 ++++++++++++++++---------
 2 files changed, 42 insertions(+), 25 deletions(-)
diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs
index 70b108cf23..7b82a0b116 100644
--- a/storage_scrubber/src/cloud_admin_api.rs
+++ b/storage_scrubber/src/cloud_admin_api.rs
@@ -138,7 +138,7 @@ pub struct ProjectData {
     pub name: String,
     pub region_id: String,
     pub platform_id: String,
-    pub user_id: String,
+    pub user_id: Option<String>,
     pub pageserver_id: Option<u64>,
     #[serde(deserialize_with = "from_nullable_id")]
     pub tenant: TenantId,
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index d53611ed6e..a0040ada08 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat
 use serde::{Deserialize, Serialize};
 use tokio_stream::StreamExt;
 use tokio_util::sync::CancellationToken;
-use utils::id::TenantId;
+use utils::{backoff, id::TenantId};
 
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
     init_remote, list_objects_with_retries,
     metadata_stream::{stream_tenant_timelines, stream_tenants},
-    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
+    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES,
 };
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -250,13 +250,16 @@ async fn find_garbage_inner(
                     &target.tenant_root(&tenant_shard_id),
                 )
                 .await?;
-                let object = tenant_objects.keys.first().unwrap();
-                if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
-                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
-                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
-                    continue;
+                if let Some(object) = tenant_objects.keys.first() {
+                    if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
+                        tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
+                        garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
+                        continue;
+                    } else {
+                        tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
+                    }
                 } else {
-                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
+                    tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran");
                 }
             } else {
                 // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
@@ -406,14 +409,17 @@ pub async fn get_tenant_objects(
     // TODO: apply extra validation based on object modification time.  Don't purge
     // tenants where any timeline's index_part.json has been touched recently.
 
-    let list = s3_client
-        .list(
-            Some(&tenant_root),
-            ListingMode::NoDelimiter,
-            None,
-            &CancellationToken::new(),
-        )
-        .await?;
+    let cancel = CancellationToken::new();
+    let list = backoff::retry(
+        || s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel),
+        |_| false,
+        3,
+        MAX_RETRIES as u32,
+        "get_tenant_objects",
+        &cancel,
+    )
+    .await
+    .expect("dummy cancellation token")?;
     Ok(list.keys)
 }
 
@@ -424,14 +430,25 @@ pub async fn get_timeline_objects(
     tracing::debug!("Listing objects in timeline {ttid}");
     let timeline_root = super::remote_timeline_path_id(&ttid);
 
-    let list = s3_client
-        .list(
-            Some(&timeline_root),
-            ListingMode::NoDelimiter,
-            None,
-            &CancellationToken::new(),
-        )
-        .await?;
+    let cancel = CancellationToken::new();
+    let list = backoff::retry(
+        || {
+            s3_client.list(
+                Some(&timeline_root),
+                ListingMode::NoDelimiter,
+                None,
+                &cancel,
+            )
+        },
+        |_| false,
+        3,
+        MAX_RETRIES as u32,
+        "get_timeline_objects",
+        &cancel,
+    )
+    .await
+    .expect("dummy cancellation token")?;
+
     Ok(list.keys)
 }
 

From 22d8834474d1f619b6ed351fd80033b4a064bb21 Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Thu, 17 Oct 2024 13:38:24 +0300
Subject: [PATCH 02/48] proxy: move the connection pools to separate file
 (#9398)

First PR for #9284
Start unification of the client and connection pool interfaces:
- Take the 'global_connections_count' handling out of get_conn_entry()
- Move remote connection pools to the conn_pool_lib as a reference
- Unify clients among all the conn pools
---
 proxy/src/serverless/backend.rs         |  13 +-
 proxy/src/serverless/conn_pool.rs       | 593 ++----------------------
 proxy/src/serverless/conn_pool_lib.rs   | 562 ++++++++++++++++++++++
 proxy/src/serverless/http_conn_pool.rs  |  50 +-
 proxy/src/serverless/local_conn_pool.rs | 111 ++---
 proxy/src/serverless/mod.rs             |   5 +-
 proxy/src/serverless/sql_over_http.rs   |  15 +-
 7 files changed, 709 insertions(+), 640 deletions(-)
 create mode 100644 proxy/src/serverless/conn_pool_lib.rs

diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index a180c4c2ed..82e81dbcfe 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -11,8 +11,9 @@ use tokio::net::{lookup_host, TcpStream};
 use tracing::field::display;
 use tracing::{debug, info};
 
-use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
-use super::http_conn_pool::{self, poll_http2_client};
+use super::conn_pool::poll_client;
+use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
+use super::http_conn_pool::{self, poll_http2_client, Send};
 use super::local_conn_pool::{self, LocalClient, LocalConnPool};
 use crate::auth::backend::local::StaticAuthRules;
 use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
@@ -31,7 +32,7 @@ use crate::rate_limiter::EndpointRateLimiter;
 use crate::{compute, EndpointId, Host};
 
 pub(crate) struct PoolingBackend {
-    pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool>,
+    pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool<Send>>,
     pub(crate) local_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
     pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
     pub(crate) config: &'static ProxyConfig,
@@ -199,7 +200,7 @@ impl PoolingBackend {
         &self,
         ctx: &RequestMonitoring,
         conn_info: ConnInfo,
-    ) -> Result<http_conn_pool::Client, HttpConnError> {
+    ) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
         info!("pool: looking for an existing connection");
         if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) {
             return Ok(client);
@@ -481,7 +482,7 @@ impl ConnectMechanism for TokioMechanism {
 }
 
 struct HyperMechanism {
-    pool: Arc<http_conn_pool::GlobalConnPool>,
+    pool: Arc<http_conn_pool::GlobalConnPool<Send>>,
     conn_info: ConnInfo,
     conn_id: uuid::Uuid,
 
@@ -491,7 +492,7 @@ struct HyperMechanism {
 
 #[async_trait]
 impl ConnectMechanism for HyperMechanism {
-    type Connection = http_conn_pool::Client;
+    type Connection = http_conn_pool::Client<Send>;
     type ConnectError = HttpConnError;
     type Error = HttpConnError;
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index aa869ff1c0..b97c656510 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -1,31 +1,29 @@
-use std::collections::HashMap;
 use std::fmt;
-use std::ops::Deref;
 use std::pin::pin;
-use std::sync::atomic::{self, AtomicUsize};
 use std::sync::{Arc, Weak};
 use std::task::{ready, Poll};
-use std::time::Duration;
 
-use dashmap::DashMap;
 use futures::future::poll_fn;
 use futures::Future;
-use parking_lot::RwLock;
-use rand::Rng;
 use smallvec::SmallVec;
 use tokio::time::Instant;
 use tokio_postgres::tls::NoTlsStream;
-use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
+use tokio_postgres::{AsyncMessage, Socket};
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, info_span, warn, Instrument, Span};
+use tracing::{error, info, info_span, warn, Instrument};
 
-use super::backend::HttpConnError;
-use crate::auth::backend::ComputeUserInfo;
 use crate::context::RequestMonitoring;
-use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
-use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
-use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
-use crate::{DbName, EndpointCacheKey, RoleName};
+use crate::control_plane::messages::MetricsAuxInfo;
+use crate::metrics::Metrics;
+
+use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool};
+
+#[cfg(test)]
+use {
+    super::conn_pool_lib::GlobalConnPoolOptions,
+    crate::auth::backend::ComputeUserInfo,
+    std::{sync::atomic, time::Duration},
+};
 
 #[derive(Debug, Clone)]
 pub(crate) struct ConnInfoWithAuth {
@@ -33,34 +31,12 @@ pub(crate) struct ConnInfoWithAuth {
     pub(crate) auth: AuthData,
 }
 
-#[derive(Debug, Clone)]
-pub(crate) struct ConnInfo {
-    pub(crate) user_info: ComputeUserInfo,
-    pub(crate) dbname: DbName,
-}
-
 #[derive(Debug, Clone)]
 pub(crate) enum AuthData {
     Password(SmallVec<[u8; 16]>),
     Jwt(String),
 }
 
-impl ConnInfo {
-    // hm, change to hasher to avoid cloning?
-    pub(crate) fn db_and_user(&self) -> (DbName, RoleName) {
-        (self.dbname.clone(), self.user_info.user.clone())
-    }
-
-    pub(crate) fn endpoint_cache_key(&self) -> Option<EndpointCacheKey> {
-        // We don't want to cache http connections for ephemeral endpoints.
-        if self.user_info.options.is_ephemeral() {
-            None
-        } else {
-            Some(self.user_info.endpoint_cache_key())
-        }
-    }
-}
-
 impl fmt::Display for ConnInfo {
     // use custom display to avoid logging password
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -75,402 +51,6 @@ impl fmt::Display for ConnInfo {
     }
 }
 
-struct ConnPoolEntry<C: ClientInnerExt> {
-    conn: ClientInner<C>,
-    _last_access: std::time::Instant,
-}
-
-// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
-// Number of open connections is limited by the `max_conns_per_endpoint`.
-pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
-    pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
-    total_conns: usize,
-    max_conns: usize,
-    _guard: HttpEndpointPoolsGuard<'static>,
-    global_connections_count: Arc<AtomicUsize>,
-    global_pool_size_max_conns: usize,
-}
-
-impl<C: ClientInnerExt> EndpointConnPool<C> {
-    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
-        let Self {
-            pools,
-            total_conns,
-            global_connections_count,
-            ..
-        } = self;
-        pools.get_mut(&db_user).and_then(|pool_entries| {
-            pool_entries.get_conn_entry(total_conns, global_connections_count.clone())
-        })
-    }
-
-    fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
-        let Self {
-            pools,
-            total_conns,
-            global_connections_count,
-            ..
-        } = self;
-        if let Some(pool) = pools.get_mut(&db_user) {
-            let old_len = pool.conns.len();
-            pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
-            let new_len = pool.conns.len();
-            let removed = old_len - new_len;
-            if removed > 0 {
-                global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
-                Metrics::get()
-                    .proxy
-                    .http_pool_opened_connections
-                    .get_metric()
-                    .dec_by(removed as i64);
-            }
-            *total_conns -= removed;
-            removed > 0
-        } else {
-            false
-        }
-    }
-
-    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
-        let conn_id = client.conn_id;
-
-        if client.is_closed() {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
-            return;
-        }
-        let global_max_conn = pool.read().global_pool_size_max_conns;
-        if pool
-            .read()
-            .global_connections_count
-            .load(atomic::Ordering::Relaxed)
-            >= global_max_conn
-        {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
-            return;
-        }
-
-        // return connection to the pool
-        let mut returned = false;
-        let mut per_db_size = 0;
-        let total_conns = {
-            let mut pool = pool.write();
-
-            if pool.total_conns < pool.max_conns {
-                let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default();
-                pool_entries.conns.push(ConnPoolEntry {
-                    conn: client,
-                    _last_access: std::time::Instant::now(),
-                });
-
-                returned = true;
-                per_db_size = pool_entries.conns.len();
-
-                pool.total_conns += 1;
-                pool.global_connections_count
-                    .fetch_add(1, atomic::Ordering::Relaxed);
-                Metrics::get()
-                    .proxy
-                    .http_pool_opened_connections
-                    .get_metric()
-                    .inc();
-            }
-
-            pool.total_conns
-        };
-
-        // do logging outside of the mutex
-        if returned {
-            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
-        } else {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
-        }
-    }
-}
-
-impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
-    fn drop(&mut self) {
-        if self.total_conns > 0 {
-            self.global_connections_count
-                .fetch_sub(self.total_conns, atomic::Ordering::Relaxed);
-            Metrics::get()
-                .proxy
-                .http_pool_opened_connections
-                .get_metric()
-                .dec_by(self.total_conns as i64);
-        }
-    }
-}
-
-pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
-    conns: Vec<ConnPoolEntry<C>>,
-}
-
-impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
-    fn default() -> Self {
-        Self { conns: Vec::new() }
-    }
-}
-
-impl<C: ClientInnerExt> DbUserConnPool<C> {
-    fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
-        let old_len = self.conns.len();
-
-        self.conns.retain(|conn| !conn.conn.is_closed());
-
-        let new_len = self.conns.len();
-        let removed = old_len - new_len;
-        *conns -= removed;
-        removed
-    }
-
-    fn get_conn_entry(
-        &mut self,
-        conns: &mut usize,
-        global_connections_count: Arc<AtomicUsize>,
-    ) -> Option<ConnPoolEntry<C>> {
-        let mut removed = self.clear_closed_clients(conns);
-        let conn = self.conns.pop();
-        if conn.is_some() {
-            *conns -= 1;
-            removed += 1;
-        }
-        global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
-        Metrics::get()
-            .proxy
-            .http_pool_opened_connections
-            .get_metric()
-            .dec_by(removed as i64);
-        conn
-    }
-}
-
-pub(crate) struct GlobalConnPool<C: ClientInnerExt> {
-    // endpoint -> per-endpoint connection pool
-    //
-    // That should be a fairly conteded map, so return reference to the per-endpoint
-    // pool as early as possible and release the lock.
-    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
-
-    /// Number of endpoint-connection pools
-    ///
-    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
-    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
-    /// It's only used for diagnostics.
-    global_pool_size: AtomicUsize,
-
-    /// Total number of connections in the pool
-    global_connections_count: Arc<AtomicUsize>,
-
-    config: &'static crate::config::HttpConfig,
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct GlobalConnPoolOptions {
-    // Maximum number of connections per one endpoint.
-    // Can mix different (dbname, username) connections.
-    // When running out of free slots for a particular endpoint,
-    // falls back to opening a new connection for each request.
-    pub max_conns_per_endpoint: usize,
-
-    pub gc_epoch: Duration,
-
-    pub pool_shards: usize,
-
-    pub idle_timeout: Duration,
-
-    pub opt_in: bool,
-
-    // Total number of connections in the pool.
-    pub max_total_conns: usize,
-}
-
-impl<C: ClientInnerExt> GlobalConnPool<C> {
-    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
-        let shards = config.pool_options.pool_shards;
-        Arc::new(Self {
-            global_pool: DashMap::with_shard_amount(shards),
-            global_pool_size: AtomicUsize::new(0),
-            config,
-            global_connections_count: Arc::new(AtomicUsize::new(0)),
-        })
-    }
-
-    #[cfg(test)]
-    pub(crate) fn get_global_connections_count(&self) -> usize {
-        self.global_connections_count
-            .load(atomic::Ordering::Relaxed)
-    }
-
-    pub(crate) fn get_idle_timeout(&self) -> Duration {
-        self.config.pool_options.idle_timeout
-    }
-
-    pub(crate) fn shutdown(&self) {
-        // drops all strong references to endpoint-pools
-        self.global_pool.clear();
-    }
-
-    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
-        let epoch = self.config.pool_options.gc_epoch;
-        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
-        loop {
-            interval.tick().await;
-
-            let shard = rng.gen_range(0..self.global_pool.shards().len());
-            self.gc(shard);
-        }
-    }
-
-    fn gc(&self, shard: usize) {
-        debug!(shard, "pool: performing epoch reclamation");
-
-        // acquire a random shard lock
-        let mut shard = self.global_pool.shards()[shard].write();
-
-        let timer = Metrics::get()
-            .proxy
-            .http_pool_reclaimation_lag_seconds
-            .start_timer();
-        let current_len = shard.len();
-        let mut clients_removed = 0;
-        shard.retain(|endpoint, x| {
-            // if the current endpoint pool is unique (no other strong or weak references)
-            // then it is currently not in use by any connections.
-            if let Some(pool) = Arc::get_mut(x.get_mut()) {
-                let EndpointConnPool {
-                    pools, total_conns, ..
-                } = pool.get_mut();
-
-                // ensure that closed clients are removed
-                for db_pool in pools.values_mut() {
-                    clients_removed += db_pool.clear_closed_clients(total_conns);
-                }
-
-                // we only remove this pool if it has no active connections
-                if *total_conns == 0 {
-                    info!("pool: discarding pool for endpoint {endpoint}");
-                    return false;
-                }
-            }
-
-            true
-        });
-
-        let new_len = shard.len();
-        drop(shard);
-        timer.observe();
-
-        // Do logging outside of the lock.
-        if clients_removed > 0 {
-            let size = self
-                .global_connections_count
-                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
-                - clients_removed;
-            Metrics::get()
-                .proxy
-                .http_pool_opened_connections
-                .get_metric()
-                .dec_by(clients_removed as i64);
-            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
-        }
-        let removed = current_len - new_len;
-
-        if removed > 0 {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_sub(removed, atomic::Ordering::Relaxed)
-                - removed;
-            info!("pool: performed global pool gc. size now {global_pool_size}");
-        }
-    }
-
-    pub(crate) fn get(
-        self: &Arc<Self>,
-        ctx: &RequestMonitoring,
-        conn_info: &ConnInfo,
-    ) -> Result<Option<Client<C>>, HttpConnError> {
-        let mut client: Option<ClientInner<C>> = None;
-        let Some(endpoint) = conn_info.endpoint_cache_key() else {
-            return Ok(None);
-        };
-
-        let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
-        if let Some(entry) = endpoint_pool
-            .write()
-            .get_conn_entry(conn_info.db_and_user())
-        {
-            client = Some(entry.conn);
-        }
-        let endpoint_pool = Arc::downgrade(&endpoint_pool);
-
-        // ok return cached connection if found and establish a new one otherwise
-        if let Some(client) = client {
-            if client.is_closed() {
-                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                return Ok(None);
-            }
-            tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
-            tracing::Span::current().record(
-                "pid",
-                tracing::field::display(client.inner.get_process_id()),
-            );
-            info!(
-                cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
-                "pool: reusing connection '{conn_info}'"
-            );
-            client.session.send(ctx.session_id())?;
-            ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
-            ctx.success();
-            return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
-        }
-        Ok(None)
-    }
-
-    fn get_or_create_endpoint_pool(
-        self: &Arc<Self>,
-        endpoint: &EndpointCacheKey,
-    ) -> Arc<RwLock<EndpointConnPool<C>>> {
-        // fast path
-        if let Some(pool) = self.global_pool.get(endpoint) {
-            return pool.clone();
-        }
-
-        // slow path
-        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
-            pools: HashMap::new(),
-            total_conns: 0,
-            max_conns: self.config.pool_options.max_conns_per_endpoint,
-            _guard: Metrics::get().proxy.http_endpoint_pools.guard(),
-            global_connections_count: self.global_connections_count.clone(),
-            global_pool_size_max_conns: self.config.pool_options.max_total_conns,
-        }));
-
-        // find or create a pool for this endpoint
-        let mut created = false;
-        let pool = self
-            .global_pool
-            .entry(endpoint.clone())
-            .or_insert_with(|| {
-                created = true;
-                new_pool
-            })
-            .clone();
-
-        // log new global pool size
-        if created {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_add(1, atomic::Ordering::Relaxed)
-                + 1;
-            info!(
-                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
-            );
-        }
-
-        pool
-    }
-}
-
 pub(crate) fn poll_client<C: ClientInnerExt>(
     global_pool: Arc<GlobalConnPool<C>>,
     ctx: &RequestMonitoring,
@@ -574,7 +154,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
 
     }
     .instrument(span));
-    let inner = ClientInner {
+    let inner = ClientInnerRemote {
         inner: client,
         session: tx,
         cancel,
@@ -584,7 +164,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     Client::new(inner, conn_info, pool_clone)
 }
 
-struct ClientInner<C: ClientInnerExt> {
+pub(crate) struct ClientInnerRemote<C: ClientInnerExt> {
     inner: C,
     session: tokio::sync::watch::Sender<uuid::Uuid>,
     cancel: CancellationToken,
@@ -592,131 +172,36 @@ struct ClientInner<C: ClientInnerExt> {
     conn_id: uuid::Uuid,
 }
 
-impl<C: ClientInnerExt> Drop for ClientInner<C> {
-    fn drop(&mut self) {
-        // on client drop, tell the conn to shut down
-        self.cancel.cancel();
+impl<C: ClientInnerExt> ClientInnerRemote<C> {
+    pub(crate) fn inner_mut(&mut self) -> &mut C {
+        &mut self.inner
     }
-}
 
-pub(crate) trait ClientInnerExt: Sync + Send + 'static {
-    fn is_closed(&self) -> bool;
-    fn get_process_id(&self) -> i32;
-}
-
-impl ClientInnerExt for tokio_postgres::Client {
-    fn is_closed(&self) -> bool {
-        self.is_closed()
+    pub(crate) fn inner(&self) -> &C {
+        &self.inner
+    }
+
+    pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender<uuid::Uuid> {
+        &mut self.session
+    }
+
+    pub(crate) fn aux(&self) -> &MetricsAuxInfo {
+        &self.aux
+    }
+
+    pub(crate) fn get_conn_id(&self) -> uuid::Uuid {
+        self.conn_id
     }
-    fn get_process_id(&self) -> i32 {
-        self.get_process_id()
-    }
-}
 
-impl<C: ClientInnerExt> ClientInner<C> {
     pub(crate) fn is_closed(&self) -> bool {
         self.inner.is_closed()
     }
 }
 
-impl<C: ClientInnerExt> Client<C> {
-    pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
-        let aux = &self.inner.as_ref().unwrap().aux;
-        USAGE_METRICS.register(Ids {
-            endpoint_id: aux.endpoint_id,
-            branch_id: aux.branch_id,
-        })
-    }
-}
-
-pub(crate) struct Client<C: ClientInnerExt> {
-    span: Span,
-    inner: Option<ClientInner<C>>,
-    conn_info: ConnInfo,
-    pool: Weak<RwLock<EndpointConnPool<C>>>,
-}
-
-pub(crate) struct Discard<'a, C: ClientInnerExt> {
-    conn_info: &'a ConnInfo,
-    pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
-}
-
-impl<C: ClientInnerExt> Client<C> {
-    pub(self) fn new(
-        inner: ClientInner<C>,
-        conn_info: ConnInfo,
-        pool: Weak<RwLock<EndpointConnPool<C>>>,
-    ) -> Self {
-        Self {
-            inner: Some(inner),
-            span: Span::current(),
-            conn_info,
-            pool,
-        }
-    }
-    pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
-        let Self {
-            inner,
-            pool,
-            conn_info,
-            span: _,
-        } = self;
-        let inner = inner.as_mut().expect("client inner should not be removed");
-        (&mut inner.inner, Discard { conn_info, pool })
-    }
-}
-
-impl<C: ClientInnerExt> Discard<'_, C> {
-    pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
-        let conn_info = &self.conn_info;
-        if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is not idle");
-        }
-    }
-    pub(crate) fn discard(&mut self) {
-        let conn_info = &self.conn_info;
-        if std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
-        }
-    }
-}
-
-impl<C: ClientInnerExt> Deref for Client<C> {
-    type Target = C;
-
-    fn deref(&self) -> &Self::Target {
-        &self
-            .inner
-            .as_ref()
-            .expect("client inner should not be removed")
-            .inner
-    }
-}
-
-impl<C: ClientInnerExt> Client<C> {
-    fn do_drop(&mut self) -> Option<impl FnOnce()> {
-        let conn_info = self.conn_info.clone();
-        let client = self
-            .inner
-            .take()
-            .expect("client inner should not be removed");
-        if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
-            let current_span = self.span.clone();
-            // return connection to the pool
-            return Some(move || {
-                let _span = current_span.enter();
-                EndpointConnPool::put(&conn_pool, &conn_info, client);
-            });
-        }
-        None
-    }
-}
-
-impl<C: ClientInnerExt> Drop for Client<C> {
+impl<C: ClientInnerExt> Drop for ClientInnerRemote<C> {
     fn drop(&mut self) {
-        if let Some(drop) = self.do_drop() {
-            tokio::task::spawn_blocking(drop);
-        }
+        // on client drop, tell the conn to shut down
+        self.cancel.cancel();
     }
 }
 
@@ -745,12 +230,12 @@ mod tests {
         }
     }
 
-    fn create_inner() -> ClientInner<MockClient> {
+    fn create_inner() -> ClientInnerRemote<MockClient> {
         create_inner_with(MockClient::new(false))
     }
 
-    fn create_inner_with(client: MockClient) -> ClientInner<MockClient> {
-        ClientInner {
+    fn create_inner_with(client: MockClient) -> ClientInnerRemote<MockClient> {
+        ClientInnerRemote {
             inner: client,
             session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
             cancel: CancellationToken::new(),
@@ -797,7 +282,7 @@ mod tests {
         {
             let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
             assert_eq!(0, pool.get_global_connections_count());
-            client.inner().1.discard();
+            client.inner_mut().1.discard();
             // Discard should not add the connection from the pool.
             assert_eq!(0, pool.get_global_connections_count());
         }
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
new file mode 100644
index 0000000000..6e964ce878
--- /dev/null
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -0,0 +1,562 @@
+use dashmap::DashMap;
+use parking_lot::RwLock;
+use rand::Rng;
+use std::{collections::HashMap, sync::Arc, sync::Weak, time::Duration};
+use std::{
+    ops::Deref,
+    sync::atomic::{self, AtomicUsize},
+};
+use tokio_postgres::ReadyForQueryStatus;
+
+use crate::control_plane::messages::ColdStartInfo;
+use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
+use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
+use crate::{
+    auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName,
+};
+
+use super::conn_pool::ClientInnerRemote;
+use tracing::info;
+use tracing::{debug, Span};
+
+use super::backend::HttpConnError;
+
+#[derive(Debug, Clone)]
+pub(crate) struct ConnInfo {
+    pub(crate) user_info: ComputeUserInfo,
+    pub(crate) dbname: DbName,
+}
+
+impl ConnInfo {
+    // hm, change to hasher to avoid cloning?
+    pub(crate) fn db_and_user(&self) -> (DbName, RoleName) {
+        (self.dbname.clone(), self.user_info.user.clone())
+    }
+
+    pub(crate) fn endpoint_cache_key(&self) -> Option<EndpointCacheKey> {
+        // We don't want to cache http connections for ephemeral endpoints.
+        if self.user_info.options.is_ephemeral() {
+            None
+        } else {
+            Some(self.user_info.endpoint_cache_key())
+        }
+    }
+}
+
+pub(crate) struct ConnPoolEntry<C: ClientInnerExt> {
+    pub(crate) conn: ClientInnerRemote<C>,
+    pub(crate) _last_access: std::time::Instant,
+}
+
+// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
+// Number of open connections is limited by the `max_conns_per_endpoint`.
+pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
+    pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
+    total_conns: usize,
+    max_conns: usize,
+    _guard: HttpEndpointPoolsGuard<'static>,
+    global_connections_count: Arc<AtomicUsize>,
+    global_pool_size_max_conns: usize,
+}
+
+impl<C: ClientInnerExt> EndpointConnPool<C> {
+    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
+        let Self {
+            pools,
+            total_conns,
+            global_connections_count,
+            ..
+        } = self;
+        pools.get_mut(&db_user).and_then(|pool_entries| {
+            let (entry, removed) = pool_entries.get_conn_entry(total_conns);
+            global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
+            entry
+        })
+    }
+
+    pub(crate) fn remove_client(
+        &mut self,
+        db_user: (DbName, RoleName),
+        conn_id: uuid::Uuid,
+    ) -> bool {
+        let Self {
+            pools,
+            total_conns,
+            global_connections_count,
+            ..
+        } = self;
+        if let Some(pool) = pools.get_mut(&db_user) {
+            let old_len = pool.conns.len();
+            pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id);
+            let new_len = pool.conns.len();
+            let removed = old_len - new_len;
+            if removed > 0 {
+                global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
+                Metrics::get()
+                    .proxy
+                    .http_pool_opened_connections
+                    .get_metric()
+                    .dec_by(removed as i64);
+            }
+            *total_conns -= removed;
+            removed > 0
+        } else {
+            false
+        }
+    }
+
+    pub(crate) fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInnerRemote<C>) {
+        let conn_id = client.get_conn_id();
+
+        if client.is_closed() {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
+            return;
+        }
+
+        let global_max_conn = pool.read().global_pool_size_max_conns;
+        if pool
+            .read()
+            .global_connections_count
+            .load(atomic::Ordering::Relaxed)
+            >= global_max_conn
+        {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
+            return;
+        }
+
+        // return connection to the pool
+        let mut returned = false;
+        let mut per_db_size = 0;
+        let total_conns = {
+            let mut pool = pool.write();
+
+            if pool.total_conns < pool.max_conns {
+                let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default();
+                pool_entries.conns.push(ConnPoolEntry {
+                    conn: client,
+                    _last_access: std::time::Instant::now(),
+                });
+
+                returned = true;
+                per_db_size = pool_entries.conns.len();
+
+                pool.total_conns += 1;
+                pool.global_connections_count
+                    .fetch_add(1, atomic::Ordering::Relaxed);
+                Metrics::get()
+                    .proxy
+                    .http_pool_opened_connections
+                    .get_metric()
+                    .inc();
+            }
+
+            pool.total_conns
+        };
+
+        // do logging outside of the lock
+        if returned {
+            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+        } else {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
+        }
+    }
+}
+
+impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
+    fn drop(&mut self) {
+        if self.total_conns > 0 {
+            self.global_connections_count
+                .fetch_sub(self.total_conns, atomic::Ordering::Relaxed);
+            Metrics::get()
+                .proxy
+                .http_pool_opened_connections
+                .get_metric()
+                .dec_by(self.total_conns as i64);
+        }
+    }
+}
+
+pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
+    pub(crate) conns: Vec<ConnPoolEntry<C>>,
+}
+
+impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
+    fn default() -> Self {
+        Self { conns: Vec::new() }
+    }
+}
+
+impl<C: ClientInnerExt> DbUserConnPool<C> {
+    fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
+        let old_len = self.conns.len();
+
+        self.conns.retain(|conn| !conn.conn.is_closed());
+
+        let new_len = self.conns.len();
+        let removed = old_len - new_len;
+        *conns -= removed;
+        removed
+    }
+
+    pub(crate) fn get_conn_entry(
+        &mut self,
+        conns: &mut usize,
+    ) -> (Option<ConnPoolEntry<C>>, usize) {
+        let mut removed = self.clear_closed_clients(conns);
+        let conn = self.conns.pop();
+        if conn.is_some() {
+            *conns -= 1;
+            removed += 1;
+        }
+
+        Metrics::get()
+            .proxy
+            .http_pool_opened_connections
+            .get_metric()
+            .dec_by(removed as i64);
+
+        (conn, removed)
+    }
+}
+
+pub(crate) struct GlobalConnPool<C: ClientInnerExt> {
+    // endpoint -> per-endpoint connection pool
+    //
+    // This should be a fairly contended map, so return a reference to the per-endpoint
+    // pool as early as possible and release the lock.
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
+
+    /// Number of endpoint-connection pools
+    ///
+    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
+    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
+    /// It's only used for diagnostics.
+    global_pool_size: AtomicUsize,
+
+    /// Total number of connections in the pool
+    global_connections_count: Arc<AtomicUsize>,
+
+    config: &'static crate::config::HttpConfig,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct GlobalConnPoolOptions {
+    // Maximum number of connections per one endpoint.
+    // Can mix different (dbname, username) connections.
+    // When running out of free slots for a particular endpoint,
+    // falls back to opening a new connection for each request.
+    pub max_conns_per_endpoint: usize,
+
+    pub gc_epoch: Duration,
+
+    pub pool_shards: usize,
+
+    pub idle_timeout: Duration,
+
+    pub opt_in: bool,
+
+    // Total number of connections in the pool.
+    pub max_total_conns: usize,
+}
+
+impl<C: ClientInnerExt> GlobalConnPool<C> {
+    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
+        let shards = config.pool_options.pool_shards;
+        Arc::new(Self {
+            global_pool: DashMap::with_shard_amount(shards),
+            global_pool_size: AtomicUsize::new(0),
+            config,
+            global_connections_count: Arc::new(AtomicUsize::new(0)),
+        })
+    }
+
+    #[cfg(test)]
+    pub(crate) fn get_global_connections_count(&self) -> usize {
+        self.global_connections_count
+            .load(atomic::Ordering::Relaxed)
+    }
+
+    pub(crate) fn get_idle_timeout(&self) -> Duration {
+        self.config.pool_options.idle_timeout
+    }
+
+    pub(crate) fn shutdown(&self) {
+        // drops all strong references to endpoint-pools
+        self.global_pool.clear();
+    }
+
+    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
+        let epoch = self.config.pool_options.gc_epoch;
+        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
+        loop {
+            interval.tick().await;
+
+            let shard = rng.gen_range(0..self.global_pool.shards().len());
+            self.gc(shard);
+        }
+    }
+
+    pub(crate) fn gc(&self, shard: usize) {
+        debug!(shard, "pool: performing epoch reclamation");
+
+        // acquire the write lock on the requested shard
+        let mut shard = self.global_pool.shards()[shard].write();
+
+        let timer = Metrics::get()
+            .proxy
+            .http_pool_reclaimation_lag_seconds
+            .start_timer();
+        let current_len = shard.len();
+        let mut clients_removed = 0;
+        shard.retain(|endpoint, x| {
+            // if the current endpoint pool is unique (no other strong or weak references)
+            // then it is currently not in use by any connections.
+            if let Some(pool) = Arc::get_mut(x.get_mut()) {
+                let EndpointConnPool {
+                    pools, total_conns, ..
+                } = pool.get_mut();
+
+                // ensure that closed clients are removed
+                for db_pool in pools.values_mut() {
+                    clients_removed += db_pool.clear_closed_clients(total_conns);
+                }
+
+                // we only remove this pool if it has no active connections
+                if *total_conns == 0 {
+                    info!("pool: discarding pool for endpoint {endpoint}");
+                    return false;
+                }
+            }
+
+            true
+        });
+
+        let new_len = shard.len();
+        drop(shard);
+        timer.observe();
+
+        // Do logging outside of the lock.
+        if clients_removed > 0 {
+            let size = self
+                .global_connections_count
+                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
+                - clients_removed;
+            Metrics::get()
+                .proxy
+                .http_pool_opened_connections
+                .get_metric()
+                .dec_by(clients_removed as i64);
+            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
+        }
+        let removed = current_len - new_len;
+
+        if removed > 0 {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_sub(removed, atomic::Ordering::Relaxed)
+                - removed;
+            info!("pool: performed global pool gc. size now {global_pool_size}");
+        }
+    }
+
+    pub(crate) fn get_or_create_endpoint_pool(
+        self: &Arc<Self>,
+        endpoint: &EndpointCacheKey,
+    ) -> Arc<RwLock<EndpointConnPool<C>>> {
+        // fast path
+        if let Some(pool) = self.global_pool.get(endpoint) {
+            return pool.clone();
+        }
+
+        // slow path
+        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
+            pools: HashMap::new(),
+            total_conns: 0,
+            max_conns: self.config.pool_options.max_conns_per_endpoint,
+            _guard: Metrics::get().proxy.http_endpoint_pools.guard(),
+            global_connections_count: self.global_connections_count.clone(),
+            global_pool_size_max_conns: self.config.pool_options.max_total_conns,
+        }));
+
+        // find or create a pool for this endpoint
+        let mut created = false;
+        let pool = self
+            .global_pool
+            .entry(endpoint.clone())
+            .or_insert_with(|| {
+                created = true;
+                new_pool
+            })
+            .clone();
+
+        // log new global pool size
+        if created {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_add(1, atomic::Ordering::Relaxed)
+                + 1;
+            info!(
+                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
+            );
+        }
+
+        pool
+    }
+
+    pub(crate) fn get(
+        self: &Arc<Self>,
+        ctx: &RequestMonitoring,
+        conn_info: &ConnInfo,
+    ) -> Result<Option<Client<C>>, HttpConnError> {
+        let mut client: Option<ClientInnerRemote<C>> = None;
+        let Some(endpoint) = conn_info.endpoint_cache_key() else {
+            return Ok(None);
+        };
+
+        let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
+        if let Some(entry) = endpoint_pool
+            .write()
+            .get_conn_entry(conn_info.db_and_user())
+        {
+            client = Some(entry.conn);
+        }
+        let endpoint_pool = Arc::downgrade(&endpoint_pool);
+
+        // reuse the cached connection if found; otherwise return Ok(None) so the caller opens a new one
+        if let Some(mut client) = client {
+            if client.is_closed() {
+                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
+                return Ok(None);
+            }
+            tracing::Span::current()
+                .record("conn_id", tracing::field::display(client.get_conn_id()));
+            tracing::Span::current().record(
+                "pid",
+                tracing::field::display(client.inner().get_process_id()),
+            );
+            info!(
+                cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
+                "pool: reusing connection '{conn_info}'"
+            );
+
+            client.session().send(ctx.session_id())?;
+            ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
+            ctx.success();
+            return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
+        }
+        Ok(None)
+    }
+}
+
+impl<C: ClientInnerExt> Client<C> {
+    pub(crate) fn new(
+        inner: ClientInnerRemote<C>,
+        conn_info: ConnInfo,
+        pool: Weak<RwLock<EndpointConnPool<C>>>,
+    ) -> Self {
+        Self {
+            inner: Some(inner),
+            span: Span::current(),
+            conn_info,
+            pool,
+        }
+    }
+
+    pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) {
+        let Self {
+            inner,
+            pool,
+            conn_info,
+            span: _,
+        } = self;
+        let inner = inner.as_mut().expect("client inner should not be removed");
+        let inner_ref = inner.inner_mut();
+        (inner_ref, Discard { conn_info, pool })
+    }
+
+    pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
+        let aux = &self.inner.as_ref().unwrap().aux();
+        USAGE_METRICS.register(Ids {
+            endpoint_id: aux.endpoint_id,
+            branch_id: aux.branch_id,
+        })
+    }
+
+    pub(crate) fn do_drop(&mut self) -> Option<impl FnOnce()> {
+        let conn_info = self.conn_info.clone();
+        let client = self
+            .inner
+            .take()
+            .expect("client inner should not be removed");
+        if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
+            let current_span = self.span.clone();
+            // return connection to the pool
+            return Some(move || {
+                let _span = current_span.enter();
+                EndpointConnPool::put(&conn_pool, &conn_info, client);
+            });
+        }
+        None
+    }
+}
+
+pub(crate) struct Client<C: ClientInnerExt> {
+    span: Span,
+    inner: Option<ClientInnerRemote<C>>,
+    conn_info: ConnInfo,
+    pool: Weak<RwLock<EndpointConnPool<C>>>,
+}
+
+impl<C: ClientInnerExt> Drop for Client<C> {
+    fn drop(&mut self) {
+        if let Some(drop) = self.do_drop() {
+            tokio::task::spawn_blocking(drop);
+        }
+    }
+}
+
+impl<C: ClientInnerExt> Deref for Client<C> {
+    type Target = C;
+
+    fn deref(&self) -> &Self::Target {
+        self.inner
+            .as_ref()
+            .expect("client inner should not be removed")
+            .inner()
+    }
+}
+
+pub(crate) trait ClientInnerExt: Sync + Send + 'static {
+    fn is_closed(&self) -> bool;
+    fn get_process_id(&self) -> i32;
+}
+
+impl ClientInnerExt for tokio_postgres::Client {
+    fn is_closed(&self) -> bool {
+        self.is_closed()
+    }
+
+    fn get_process_id(&self) -> i32 {
+        self.get_process_id()
+    }
+}
+
+pub(crate) struct Discard<'a, C: ClientInnerExt> {
+    conn_info: &'a ConnInfo,
+    pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
+}
+
+impl<C: ClientInnerExt> Discard<'_, C> {
+    pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
+        let conn_info = &self.conn_info;
+        if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
+            info!("pool: throwing away connection '{conn_info}' because connection is not idle");
+        }
+    }
+    pub(crate) fn discard(&mut self) {
+        let conn_info = &self.conn_info;
+        if std::mem::take(self.pool).strong_count() > 0 {
+            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
+        }
+    }
+}
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index 9b6bc98557..79bb19328f 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -10,11 +10,12 @@ use rand::Rng;
 use tokio::net::TcpStream;
 use tracing::{debug, error, info, info_span, Instrument};
 
-use super::conn_pool::ConnInfo;
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
+
+use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
 use crate::EndpointCacheKey;
 
 pub(crate) type Send = http2::SendRequest<hyper::body::Incoming>;
@@ -22,15 +23,15 @@ pub(crate) type Connect =
     http2::Connection<TokioIo<TcpStream>, hyper::body::Incoming, TokioExecutor>;
 
 #[derive(Clone)]
-struct ConnPoolEntry {
-    conn: Send,
+pub(crate) struct ConnPoolEntry<C: ClientInnerExt + Clone> {
+    conn: C,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
 }
 
 // Per-endpoint connection pool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
-pub(crate) struct EndpointConnPool {
+pub(crate) struct EndpointConnPool<C: ClientInnerExt + Clone> {
     // TODO(conrad):
     // either we should open more connections depending on stream count
     // (not exposed by hyper, need our own counter)
@@ -40,13 +41,13 @@ pub(crate) struct EndpointConnPool {
     // seems somewhat redundant though.
     //
     // Probably we should run a semaphore and just the single conn. TBD.
-    conns: VecDeque<ConnPoolEntry>,
+    conns: VecDeque<ConnPoolEntry<C>>,
     _guard: HttpEndpointPoolsGuard<'static>,
     global_connections_count: Arc<AtomicUsize>,
 }
 
-impl EndpointConnPool {
-    fn get_conn_entry(&mut self) -> Option<ConnPoolEntry> {
+impl<C: ClientInnerExt + Clone> EndpointConnPool<C> {
+    fn get_conn_entry(&mut self) -> Option<ConnPoolEntry<C>> {
         let Self { conns, .. } = self;
 
         loop {
@@ -81,7 +82,7 @@ impl EndpointConnPool {
     }
 }
 
-impl Drop for EndpointConnPool {
+impl<C: ClientInnerExt + Clone> Drop for EndpointConnPool<C> {
     fn drop(&mut self) {
         if !self.conns.is_empty() {
             self.global_connections_count
@@ -95,12 +96,12 @@ impl Drop for EndpointConnPool {
     }
 }
 
-pub(crate) struct GlobalConnPool {
+pub(crate) struct GlobalConnPool<C: ClientInnerExt + Clone> {
     // endpoint -> per-endpoint connection pool
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
     // pool as early as possible and release the lock.
-    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
 
     /// Number of endpoint-connection pools
     ///
@@ -115,7 +116,7 @@ pub(crate) struct GlobalConnPool {
     config: &'static crate::config::HttpConfig,
 }
 
-impl GlobalConnPool {
+impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
     pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
         let shards = config.pool_options.pool_shards;
         Arc::new(Self {
@@ -210,7 +211,7 @@ impl GlobalConnPool {
         self: &Arc<Self>,
         ctx: &RequestMonitoring,
         conn_info: &ConnInfo,
-    ) -> Option<Client> {
+    ) -> Option<Client<C>> {
         let endpoint = conn_info.endpoint_cache_key()?;
         let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
         let client = endpoint_pool.write().get_conn_entry()?;
@@ -228,7 +229,7 @@ impl GlobalConnPool {
     fn get_or_create_endpoint_pool(
         self: &Arc<Self>,
         endpoint: &EndpointCacheKey,
-    ) -> Arc<RwLock<EndpointConnPool>> {
+    ) -> Arc<RwLock<EndpointConnPool<C>>> {
         // fast path
         if let Some(pool) = self.global_pool.get(endpoint) {
             return pool.clone();
@@ -268,14 +269,14 @@ impl GlobalConnPool {
 }
 
 pub(crate) fn poll_http2_client(
-    global_pool: Arc<GlobalConnPool>,
+    global_pool: Arc<GlobalConnPool<Send>>,
     ctx: &RequestMonitoring,
     conn_info: &ConnInfo,
     client: Send,
     connection: Connect,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
-) -> Client {
+) -> Client<Send> {
     let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
     let session_id = ctx.session_id();
 
@@ -322,13 +323,13 @@ pub(crate) fn poll_http2_client(
     Client::new(client, aux)
 }
 
-pub(crate) struct Client {
-    pub(crate) inner: Send,
+pub(crate) struct Client<C: ClientInnerExt + Clone> {
+    pub(crate) inner: C,
     aux: MetricsAuxInfo,
 }
 
-impl Client {
-    pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self {
+impl<C: ClientInnerExt + Clone> Client<C> {
+    pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self {
         Self { inner, aux }
     }
 
@@ -339,3 +340,14 @@ impl Client {
         })
     }
 }
+
+impl ClientInnerExt for Send {
+    fn is_closed(&self) -> bool {
+        self.is_closed()
+    }
+
+    fn get_process_id(&self) -> i32 {
+        // placeholder value; ideally this would return something meaningful
+        -1
+    }
+}
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index 5df37a8762..c4fdd00f78 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -20,11 +20,12 @@ use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument, Span};
 
 use super::backend::HttpConnError;
-use super::conn_pool::{ClientInnerExt, ConnInfo};
+use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
+
 use crate::{DbName, RoleName};
 
 struct ConnPoolEntry<C: ClientInnerExt> {
@@ -362,7 +363,7 @@ pub(crate) fn poll_client(
     LocalClient::new(inner, conn_info, pool_clone)
 }
 
-struct ClientInner<C: ClientInnerExt> {
+pub(crate) struct ClientInner<C: ClientInnerExt> {
     inner: C,
     session: tokio::sync::watch::Sender<uuid::Uuid>,
     cancel: CancellationToken,
@@ -387,13 +388,24 @@ impl<C: ClientInnerExt> ClientInner<C> {
     }
 }
 
-impl<C: ClientInnerExt> LocalClient<C> {
-    pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
-        let aux = &self.inner.as_ref().unwrap().aux;
-        USAGE_METRICS.register(Ids {
-            endpoint_id: aux.endpoint_id,
-            branch_id: aux.branch_id,
-        })
+impl ClientInner<tokio_postgres::Client> {
+    pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> {
+        self.jti += 1;
+        let token = resign_jwt(&self.key, payload, self.jti)?;
+
+        // initiates the auth session
+        self.inner.simple_query("discard all").await?;
+        self.inner
+            .query(
+                "select auth.jwt_session_init($1)",
+                &[&token as &(dyn ToSql + Sync)],
+            )
+            .await?;
+
+        let pid = self.inner.get_process_id();
+        info!(pid, jti = self.jti, "user session state init");
+
+        Ok(())
     }
 }
 
@@ -422,6 +434,18 @@ impl<C: ClientInnerExt> LocalClient<C> {
             pool,
         }
     }
+
+    pub(crate) fn client_inner(&mut self) -> (&mut ClientInner<C>, Discard<'_, C>) {
+        let Self {
+            inner,
+            pool,
+            conn_info,
+            span: _,
+        } = self;
+        let inner_m = inner.as_mut().expect("client inner should not be removed");
+        (inner_m, Discard { conn_info, pool })
+    }
+
     pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
         let Self {
             inner,
@@ -434,33 +458,6 @@ impl<C: ClientInnerExt> LocalClient<C> {
     }
 }
 
-impl LocalClient<tokio_postgres::Client> {
-    pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> {
-        let inner = self
-            .inner
-            .as_mut()
-            .expect("client inner should not be removed");
-
-        inner.jti += 1;
-        let token = resign_jwt(&inner.key, payload, inner.jti)?;
-
-        // initiates the auth session
-        inner.inner.simple_query("discard all").await?;
-        inner
-            .inner
-            .query(
-                "select auth.jwt_session_init($1)",
-                &[&token as &(dyn ToSql + Sync)],
-            )
-            .await?;
-
-        let pid = inner.inner.get_process_id();
-        info!(pid, jti = inner.jti, "user session state init");
-
-        Ok(())
-    }
-}
-
 /// implements relatively efficient in-place json object key upserting
 ///
 /// only supports top-level keys
@@ -524,24 +521,15 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
     jwt
 }
 
-impl<C: ClientInnerExt> Discard<'_, C> {
-    pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
-        let conn_info = &self.conn_info;
-        if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
-            info!(
-                "local_pool: throwing away connection '{conn_info}' because connection is not idle"
-            );
-        }
-    }
-    pub(crate) fn discard(&mut self) {
-        let conn_info = &self.conn_info;
-        if std::mem::take(self.pool).strong_count() > 0 {
-            info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
-        }
-    }
-}
-
 impl<C: ClientInnerExt> LocalClient<C> {
+    pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
+        let aux = &self.inner.as_ref().unwrap().aux;
+        USAGE_METRICS.register(Ids {
+            endpoint_id: aux.endpoint_id,
+            branch_id: aux.branch_id,
+        })
+    }
+
     fn do_drop(&mut self) -> Option<impl FnOnce()> {
         let conn_info = self.conn_info.clone();
         let client = self
@@ -568,6 +556,23 @@ impl<C: ClientInnerExt> Drop for LocalClient<C> {
     }
 }
 
+impl<C: ClientInnerExt> Discard<'_, C> {
+    pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
+        let conn_info = &self.conn_info;
+        if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
+            info!(
+                "local_pool: throwing away connection '{conn_info}' because connection is not idle"
+            );
+        }
+    }
+    pub(crate) fn discard(&mut self) {
+        let conn_info = &self.conn_info;
+        if std::mem::take(self.pool).strong_count() > 0 {
+            info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use p256::ecdsa::SigningKey;
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index 3ed3b6c845..29ff7b9d91 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -5,6 +5,7 @@
 mod backend;
 pub mod cancel_set;
 mod conn_pool;
+mod conn_pool_lib;
 mod http_conn_pool;
 mod http_util;
 mod json;
@@ -20,7 +21,7 @@ use anyhow::Context;
 use async_trait::async_trait;
 use atomic_take::AtomicTake;
 use bytes::Bytes;
-pub use conn_pool::GlobalConnPoolOptions;
+pub use conn_pool_lib::GlobalConnPoolOptions;
 use futures::future::{select, Either};
 use futures::TryFutureExt;
 use http::{Method, Response, StatusCode};
@@ -65,7 +66,7 @@ pub async fn task_main(
     }
 
     let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config);
-    let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config);
+    let conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config);
     {
         let conn_pool = Arc::clone(&conn_pool);
         tokio::spawn(async move {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 3d8a2adef1..bb5eb390a6 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -25,10 +25,11 @@ use urlencoding;
 use utils::http::error::ApiError;
 
 use super::backend::{LocalProxyConnError, PoolingBackend};
-use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth};
+use super::conn_pool::{AuthData, ConnInfoWithAuth};
+use super::conn_pool_lib::{self, ConnInfo};
 use super::http_util::json_response;
 use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
-use super::{conn_pool, local_conn_pool};
+use super::local_conn_pool;
 use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
 use crate::auth::{endpoint_sni, ComputeUserInfoParseError};
 use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
@@ -37,6 +38,7 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::metrics::{HttpDirection, Metrics};
 use crate::proxy::{run_until_cancelled, NeonOptions};
 use crate::serverless::backend::HttpConnError;
+
 use crate::usage_metrics::{MetricCounter, MetricCounterRecorder};
 use crate::{DbName, RoleName};
 
@@ -607,7 +609,8 @@ async fn handle_db_inner(
             let client = match keys.keys {
                 ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => {
                     let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?;
-                    client.set_jwt_session(&payload).await?;
+                    let (cli_inner, _dsc) = client.client_inner();
+                    cli_inner.set_jwt_session(&payload).await?;
                     Client::Local(client)
                 }
                 _ => {
@@ -1021,12 +1024,12 @@ async fn query_to_json<T: GenericClient>(
 }
 
 enum Client {
-    Remote(conn_pool::Client<tokio_postgres::Client>),
+    Remote(conn_pool_lib::Client<tokio_postgres::Client>),
     Local(local_conn_pool::LocalClient<tokio_postgres::Client>),
 }
 
 enum Discard<'a> {
-    Remote(conn_pool::Discard<'a, tokio_postgres::Client>),
+    Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>),
     Local(local_conn_pool::Discard<'a, tokio_postgres::Client>),
 }
 
@@ -1041,7 +1044,7 @@ impl Client {
     fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
         match self {
             Client::Remote(client) => {
-                let (c, d) = client.inner();
+                let (c, d) = client.inner_mut();
                 (c, Discard::Remote(d))
             }
             Client::Local(local_client) => {

From 35e7d91bc9eb07c8ef70acef5e224c9b9e78a0ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 17 Oct 2024 14:07:58 +0200
Subject: [PATCH 03/48] Add config variable for timeline offloading (#9421)

Adds a configuration variable for timeline offloading support. The added
pageserver-global config option controls whether the pageserver
automatically offloads timelines during compaction.

Therefore, already offloaded timelines are not affected by this, nor is
the manual testing endpoint.

This allows the rollout of timeline offloading to be driven by the
storage team.

Part of #8088
---
 libs/pageserver_api/src/config.rs            | 2 ++
 pageserver/src/config.rs                     | 5 +++++
 pageserver/src/tenant.rs                     | 3 ++-
 pageserver/src/tenant/timeline.rs            | 1 +
 test_runner/regress/test_timeline_archive.py | 4 ++++
 5 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 24474d4840..896a5d8069 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -102,6 +102,7 @@ pub struct ConfigToml {
     pub ingest_batch_size: u64,
     pub max_vectored_read_bytes: MaxVectoredReadBytes,
     pub image_compression: ImageCompressionAlgorithm,
+    pub timeline_offloading: bool,
     pub ephemeral_bytes_per_memory_kb: usize,
     pub l0_flush: Option<crate::models::L0FlushConfig>,
     pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
@@ -385,6 +386,7 @@ impl Default for ConfigToml {
                 NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
             )),
             image_compression: (DEFAULT_IMAGE_COMPRESSION),
+            timeline_offloading: false,
             ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: None,
             virtual_file_io_mode: None,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 8db78285e4..06d4326459 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -164,6 +164,9 @@ pub struct PageServerConf {
 
     pub image_compression: ImageCompressionAlgorithm,
 
+    /// Whether to offload archived timelines automatically
+    pub timeline_offloading: bool,
+
     /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
     /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
     /// of ephemeral data.
@@ -321,6 +324,7 @@ impl PageServerConf {
             ingest_batch_size,
             max_vectored_read_bytes,
             image_compression,
+            timeline_offloading,
             ephemeral_bytes_per_memory_kb,
             l0_flush,
             virtual_file_io_mode,
@@ -364,6 +368,7 @@ impl PageServerConf {
             ingest_batch_size,
             max_vectored_read_bytes,
             image_compression,
+            timeline_offloading,
             ephemeral_bytes_per_memory_kb,
 
             // ------------------------------------------------------------
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 689982ddd4..baa2365658 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2187,7 +2187,8 @@ impl Tenant {
                             .iter()
                             .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
                     };
-                    let can_offload = can_offload && has_no_unoffloaded_children;
+                    let can_offload =
+                        can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading;
                     if (is_active, can_offload) == (false, false) {
                         None
                     } else {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1992dee930..2b4f949c76 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1565,6 +1565,7 @@ impl Timeline {
     }
 
     /// Checks if the internal state of the timeline is consistent with it being able to be offloaded.
+    ///
     /// This is neccessary but not sufficient for offloading of the timeline as it might have
     /// child timelines that are not offloaded yet.
     pub(crate) fn can_offload(&self) -> bool {
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index ffaed5e130..85e1077fd5 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -119,6 +119,10 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int):
 
 @pytest.mark.parametrize("manual_offload", [False, True])
 def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool):
+    if not manual_offload:
+        # (automatic) timeline offloading defaults to false for now
+        neon_env_builder.pageserver_config_override = "timeline_offloading = true"
+
     env = neon_env_builder.init_start()
     ps_http = env.pageserver.http_client()
 

From 8b479381403cd2be8f7bc7eba69d5074735d8924 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 17 Oct 2024 13:37:21 +0100
Subject: [PATCH 04/48] Add support of extensions for v17 (part 3) (#9430)

- pgvector 0.7.4

Update support of extensions for v14-v16:
- pgvector 0.7.2 -> 0.7.4
---
 compute/Dockerfile.compute-node | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node
index b0ce7c1718..45c1fd9f38 100644
--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -353,13 +353,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
 #
-# v17 is not supported yet because of upstream issue
-# https://github.com/pgvector/pgvector/issues/669
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
-    echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
+# vector 0.7.4 supports v17
+# last release v0.7.4 - Aug 5, 2024
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \
+    echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \

From a7c05686ccbebc856b0ce389a9fa60d2bddbeea6 Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Thu, 17 Oct 2024 17:20:42 +0300
Subject: [PATCH 05/48] test_runner: Update the README.md to build neon with
 'testing' (#9437)

Without '--features testing' in the cargo build, the proxy won't start,
causing tests to fail.
---
 test_runner/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/README.md b/test_runner/README.md
index e087241c1f..55d8d2faa9 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -6,7 +6,7 @@ Prerequisites:
 - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
 - Neon and Postgres binaries
     - See the root [README.md](/README.md) for build directions
-      If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands.
+      To run tests you need to add `--features testing` to Rust code build commands.
       For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags.
       Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release`
     - Tests can be run from the git tree; or see the environment variables

From f3a3eefd26284776ab3179116374009ec537ab11 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 17 Oct 2024 10:29:53 -0400
Subject: [PATCH 06/48] feat(pageserver): do space check before gc-compaction
 (#9250)

part of https://github.com/neondatabase/neon/issues/9114

## Summary of changes

gc-compaction may take a lot of disk space, and if it does, the caller
should do a partial gc-compaction. This patch adds a space check for the
compaction job.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/disk_usage_eviction_task.rs   | 11 +----
 pageserver/src/statvfs.rs                    | 16 ++++++++
 pageserver/src/tenant/storage_layer/layer.rs |  4 ++
 pageserver/src/tenant/timeline/compaction.rs | 42 ++++++++++++++++++++
 4 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index a58fa2c0b1..7ab2ba8742 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -1218,16 +1218,7 @@ mod filesystem_level_usage {
         let stat = Statvfs::get(tenants_dir, mock_config)
             .context("statvfs failed, presumably directory got unlinked")?;
 
-        // https://unix.stackexchange.com/a/703650
-        let blocksize = if stat.fragment_size() > 0 {
-            stat.fragment_size()
-        } else {
-            stat.block_size()
-        };
-
-        // use blocks_available (b_avail) since, pageserver runs as unprivileged user
-        let avail_bytes = stat.blocks_available() * blocksize;
-        let total_bytes = stat.blocks() * blocksize;
+        let (avail_bytes, total_bytes) = stat.get_avail_total_bytes();
 
         Ok(Usage {
             config,
diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs
index 5a6f6e5176..205605bc86 100644
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -53,6 +53,22 @@ impl Statvfs {
             Statvfs::Mock(stat) => stat.block_size,
         }
     }
+
+    /// Get the available and total bytes on the filesystem.
+    pub fn get_avail_total_bytes(&self) -> (u64, u64) {
+        // https://unix.stackexchange.com/a/703650
+        let blocksize = if self.fragment_size() > 0 {
+            self.fragment_size()
+        } else {
+            self.block_size()
+        };
+
+        // use blocks_available (b_avail) since, pageserver runs as unprivileged user
+        let avail_bytes = self.blocks_available() * blocksize;
+        let total_bytes = self.blocks() * blocksize;
+
+        (avail_bytes, total_bytes)
+    }
 }
 
 pub mod mock {
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index bbb21b180e..f29a33bae6 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -341,6 +341,10 @@ impl Layer {
         Ok(())
     }
 
+    pub(crate) async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
+        self.0.needs_download().await
+    }
+
     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     /// while the guard exists.
     ///
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8b9ace1e5b..5588363330 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -29,6 +29,7 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
+use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::filter_iterator::FilterIterator;
@@ -1691,6 +1692,45 @@ impl Timeline {
         unreachable!("key retention is empty")
     }
 
+    /// Check how much space is left on the disk
+    async fn check_available_space(self: &Arc<Self>) -> anyhow::Result<u64> {
+        let tenants_dir = self.conf.tenants_path();
+
+        let stat = Statvfs::get(&tenants_dir, None)
+            .context("statvfs failed, presumably directory got unlinked")?;
+
+        let (avail_bytes, _) = stat.get_avail_total_bytes();
+
+        Ok(avail_bytes)
+    }
+
+    /// Check if the compaction can proceed safely without running out of space. We assume the size
+    /// upper bound of the produced files of a compaction job is the same as all layers involved in
+    /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a
+    /// compaction.
+    async fn check_compaction_space(
+        self: &Arc<Self>,
+        layer_selection: &[Layer],
+    ) -> anyhow::Result<()> {
+        let available_space = self.check_available_space().await?;
+        let mut remote_layer_size = 0;
+        let mut all_layer_size = 0;
+        for layer in layer_selection {
+            let needs_download = layer.needs_download().await?;
+            if needs_download.is_some() {
+                remote_layer_size += layer.layer_desc().file_size;
+            }
+            all_layer_size += layer.layer_desc().file_size;
+        }
+        let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
+        if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
+        {
+            return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
+                available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size));
+        }
+        Ok(())
+    }
+
     /// An experimental compaction building block that combines compaction with garbage collection.
     ///
     /// The current implementation picks all delta + image layers that are below or intersecting with
@@ -1806,6 +1846,8 @@ impl Timeline {
             lowest_retain_lsn
         );
 
+        self.check_compaction_space(&layer_selection).await?;
+
         // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
         // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
         let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)

From 4c9835f4a3065648c2d6ecd721664b88557aca0f Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 17 Oct 2024 16:34:51 +0200
Subject: [PATCH 07/48] storage_controller: delete stale shards when deleting
 tenant (#9333)

## Problem

Tenant deletion only removes the current shards from remote storage. Any
stale parent shards (before splits) will be left behind. These shards
are kept since child shards may reference data from the parent until new
image layers are generated.

## Summary of changes

* Document a special case for pageserver tenant deletion that deletes
all shards in remote storage when given an unsharded tenant ID, as well
as any unsharded tenant data.
* Pass an unsharded tenant ID to delete all remote storage under the
tenant ID prefix.
* Split out `RemoteStorage::delete_prefix()` to delete a bucket prefix,
with additional test coverage.
* Add a `delimiter` argument to `assert_prefix_empty()` to support
partial prefix matches (i.e. all shards starting with a given tenant
ID).
---
 libs/remote_storage/src/lib.rs            |  53 +++++-
 libs/remote_storage/tests/common/tests.rs | 206 ++++++++++++++++++++++
 pageserver/src/tenant/mgr.rs              |  71 +++-----
 storage_controller/src/service.rs         |  73 ++++----
 test_runner/fixtures/pageserver/utils.py  |  15 +-
 test_runner/regress/test_tenant_delete.py |  55 ++++++
 6 files changed, 376 insertions(+), 97 deletions(-)

diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index c6466237bf..719608dd5f 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -19,7 +19,12 @@ mod simulate_failures;
 mod support;
 
 use std::{
-    collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc,
+    collections::HashMap,
+    fmt::Debug,
+    num::NonZeroU32,
+    ops::Bound,
+    pin::{pin, Pin},
+    sync::Arc,
     time::SystemTime,
 };
 
@@ -28,6 +33,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 
 use bytes::Bytes;
 use futures::{stream::Stream, StreamExt};
+use itertools::Itertools as _;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -261,7 +267,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
         max_keys: Option<NonZeroU32>,
         cancel: &CancellationToken,
     ) -> Result<Listing, DownloadError> {
-        let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
+        let mut stream = pin!(self.list_streaming(prefix, mode, max_keys, cancel));
         let mut combined = stream.next().await.expect("At least one item required")?;
         while let Some(list) = stream.next().await {
             let list = list?;
@@ -324,6 +330,35 @@ pub trait RemoteStorage: Send + Sync + 'static {
         cancel: &CancellationToken,
     ) -> anyhow::Result<()>;
 
+    /// Deletes all objects matching the given prefix.
+    ///
+    /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
+    /// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will
+    /// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
+    /// through.
+    async fn delete_prefix(
+        &self,
+        prefix: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let mut stream =
+            pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel));
+        while let Some(result) = stream.next().await {
+            let keys = match result {
+                Ok(listing) if listing.keys.is_empty() => continue,
+                Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(),
+                Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()),
+                Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()),
+                Err(err) => return Err(err.into()),
+            };
+            tracing::info!("Deleting {} keys from remote storage", keys.len());
+            self.delete_objects(&keys, cancel).await?;
+        }
+        Ok(())
+    }
+
     /// Copy a remote object inside a bucket from one path to another.
     async fn copy(
         &self,
@@ -488,6 +523,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         }
     }
 
+    /// See [`RemoteStorage::delete_prefix`]
+    pub async fn delete_prefix(
+        &self,
+        prefix: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await,
+            Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await,
+            Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await,
+            Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await,
+        }
+    }
+
     /// See [`RemoteStorage::copy`]
     pub async fn copy_object(
         &self,
diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs
index e6f33fc3f8..d5da1d48e9 100644
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -199,6 +199,138 @@ async fn list_no_delimiter_works(
     Ok(())
 }
 
+/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"),
+/// but only with NoDelimiter.
+#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
+#[tokio::test]
+async fn list_partial_prefix(
+    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("S3 init failed: {e:?}")
+        }
+    };
+
+    let cancel = CancellationToken::new();
+    let test_client = Arc::clone(&ctx.enabled.client);
+
+    // Prefix "fold" should match all "folder{i}" directories with NoDelimiter.
+    let objects: HashSet<_> = test_client
+        .list(
+            Some(&RemotePath::from_string("fold")?),
+            ListingMode::NoDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .keys
+        .into_iter()
+        .map(|o| o.key)
+        .collect();
+    assert_eq!(&objects, &ctx.remote_blobs);
+
+    // Prefix "fold" matches nothing with WithDelimiter.
+    let objects: HashSet<_> = test_client
+        .list(
+            Some(&RemotePath::from_string("fold")?),
+            ListingMode::WithDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .keys
+        .into_iter()
+        .map(|o| o.key)
+        .collect();
+    assert!(objects.is_empty());
+
+    // Prefix "" matches everything.
+    let objects: HashSet<_> = test_client
+        .list(
+            Some(&RemotePath::from_string("")?),
+            ListingMode::NoDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .keys
+        .into_iter()
+        .map(|o| o.key)
+        .collect();
+    assert_eq!(&objects, &ctx.remote_blobs);
+
+    // Prefix "" matches nothing with WithDelimiter.
+    let objects: HashSet<_> = test_client
+        .list(
+            Some(&RemotePath::from_string("")?),
+            ListingMode::WithDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .keys
+        .into_iter()
+        .map(|o| o.key)
+        .collect();
+    assert!(objects.is_empty());
+
+    // Prefix "foo" matches nothing.
+    let objects: HashSet<_> = test_client
+        .list(
+            Some(&RemotePath::from_string("foo")?),
+            ListingMode::NoDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .keys
+        .into_iter()
+        .map(|o| o.key)
+        .collect();
+    assert!(objects.is_empty());
+
+    // Prefix "folder2/blob" matches.
+    let objects: HashSet<_> = test_client
+        .list(
+            Some(&RemotePath::from_string("folder2/blob")?),
+            ListingMode::NoDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .keys
+        .into_iter()
+        .map(|o| o.key)
+        .collect();
+    let expect: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .filter(|o| o.get_path().starts_with("folder2"))
+        .cloned()
+        .collect();
+    assert_eq!(&objects, &expect);
+
+    // Prefix "folder2/foo" matches nothing.
+    let objects: HashSet<_> = test_client
+        .list(
+            Some(&RemotePath::from_string("folder2/foo")?),
+            ListingMode::NoDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .keys
+        .into_iter()
+        .map(|o| o.key)
+        .collect();
+    assert!(objects.is_empty());
+
+    Ok(())
+}
+
 #[test_context(MaybeEnabledStorage)]
 #[tokio::test]
 async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
@@ -265,6 +397,80 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
     Ok(())
 }
 
+/// Tests that delete_prefix() will delete all objects matching a prefix, including
+/// partial prefixes (i.e. "/foo" matches "/foobar").
+#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
+#[tokio::test]
+async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("S3 init failed: {e:?}")
+        }
+    };
+
+    let cancel = CancellationToken::new();
+    let test_client = Arc::clone(&ctx.enabled.client);
+
+    /// Asserts that the S3 listing matches the given paths.
+    macro_rules! assert_list {
+        ($expect:expr) => {{
+            let listing = test_client
+                .list(None, ListingMode::NoDelimiter, None, &cancel)
+                .await?
+                .keys
+                .into_iter()
+                .map(|o| o.key)
+                .collect();
+            assert_eq!($expect, listing);
+        }};
+    }
+
+    // We start with the full set of uploaded files.
+    let mut expect = ctx.remote_blobs.clone();
+
+    // Deleting a non-existing prefix should do nothing.
+    test_client
+        .delete_prefix(&RemotePath::from_string("xyz")?, &cancel)
+        .await?;
+    assert_list!(expect);
+
+    // Prefixes are case-sensitive.
+    test_client
+        .delete_prefix(&RemotePath::from_string("Folder")?, &cancel)
+        .await?;
+    assert_list!(expect);
+
+    // Deleting a path which overlaps with an existing object should do nothing. We pick the first
+    // path in the set as our common prefix.
+    let path = expect.iter().next().expect("empty set").clone().join("xyz");
+    test_client.delete_prefix(&path, &cancel).await?;
+    assert_list!(expect);
+
+    // Deleting an exact path should work. We pick the first path in the set.
+    let path = expect.iter().next().expect("empty set").clone();
+    test_client.delete_prefix(&path, &cancel).await?;
+    expect.remove(&path);
+    assert_list!(expect);
+
+    // Deleting a prefix should delete all matching objects.
+    test_client
+        .delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel)
+        .await?;
+    expect.retain(|p| !p.get_path().as_str().starts_with("folder0/"));
+    assert_list!(expect);
+
+    // Deleting a common prefix should delete all objects.
+    test_client
+        .delete_prefix(&RemotePath::from_string("fold")?, &cancel)
+        .await?;
+    expect.clear();
+    assert_list!(expect);
+
+    Ok(())
+}
+
 #[test_context(MaybeEnabledStorage)]
 #[tokio::test]
 async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 9d9852c525..0567f8f3a7 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -11,6 +11,7 @@ use pageserver_api::shard::{
 };
 use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
+use remote_storage::TimeoutOrCancel;
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap, HashSet};
@@ -1350,47 +1351,17 @@ impl TenantManager {
         }
     }
 
-    async fn delete_tenant_remote(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<(), DeleteTenantError> {
-        let remote_path = remote_tenant_path(&tenant_shard_id);
-        let mut keys_stream = self.resources.remote_storage.list_streaming(
-            Some(&remote_path),
-            remote_storage::ListingMode::NoDelimiter,
-            None,
-            &self.cancel,
-        );
-        while let Some(chunk) = keys_stream.next().await {
-            let keys = match chunk {
-                Ok(listing) => listing.keys,
-                Err(remote_storage::DownloadError::Cancelled) => {
-                    return Err(DeleteTenantError::Cancelled)
-                }
-                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
-                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-            };
-
-            if keys.is_empty() {
-                tracing::info!("Remote storage already deleted");
-            } else {
-                tracing::info!("Deleting {} keys from remote storage", keys.len());
-                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
-                self.resources
-                    .remote_storage
-                    .delete_objects(&keys, &self.cancel)
-                    .await?;
-            }
-        }
-
-        Ok(())
-    }
-
     /// If a tenant is attached, detach it.  Then remove its data from remote storage.
     ///
     /// A tenant is considered deleted once it is gone from remote storage.  It is the caller's
     /// responsibility to avoid trying to attach the tenant again or use it any way once deletion
     /// has started: this operation is not atomic, and must be retried until it succeeds.
+    ///
+    /// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove
+    /// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage
+    /// controller uses this to purge all remote tenant data, including any stale parent shards that
+    /// may remain after splits. Ideally, this special case would be handled elsewhere. See:
+    /// <https://github.com/neondatabase/neon/pull/9394>.
     pub(crate) async fn delete_tenant(
         &self,
         tenant_shard_id: TenantShardId,
@@ -1442,25 +1413,29 @@ impl TenantManager {
         //   in 500 responses to delete requests.
         // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
         //   503/retry, rather than kicking off a wasteful concurrent deletion.
-        match backoff::retry(
-            || async move { self.delete_tenant_remote(tenant_shard_id).await },
-            |e| match e {
-                DeleteTenantError::Cancelled => true,
-                DeleteTenantError::SlotError(_) => {
-                    unreachable!("Remote deletion doesn't touch slots")
-                }
-                _ => false,
+        // NB: this also deletes partial prefixes, i.e. a <tenant_id> path will delete all
+        // <tenant_id>_<shard_id>/* objects. See method comment for why.
+        backoff::retry(
+            || async move {
+                self.resources
+                    .remote_storage
+                    .delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel)
+                    .await
             },
+            |_| false, // backoff::retry handles cancellation
             1,
             3,
             &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
             &self.cancel,
         )
         .await
-        {
-            Some(r) => r,
-            None => Err(DeleteTenantError::Cancelled),
-        }
+        .unwrap_or(Err(TimeoutOrCancel::Cancel.into()))
+        .map_err(|err| {
+            if TimeoutOrCancel::caused_by_cancel(&err) {
+                return DeleteTenantError::Cancelled;
+            }
+            DeleteTenantError::Other(err)
+        })
     }
 
     #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 25e1fb5e1f..ab2c3b5e48 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2862,17 +2862,12 @@ impl Service {
         let _tenant_lock =
             trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await;
 
-        // Detach all shards
-        let (detach_waiters, shard_ids, node) = {
-            let mut shard_ids = Vec::new();
+        // Detach all shards. This also deletes local pageserver shard data.
+        let (detach_waiters, node) = {
             let mut detach_waiters = Vec::new();
             let mut locked = self.inner.write().unwrap();
             let (nodes, tenants, scheduler) = locked.parts_mut();
-            for (tenant_shard_id, shard) in
-                tenants.range_mut(TenantShardId::tenant_range(tenant_id))
-            {
-                shard_ids.push(*tenant_shard_id);
-
+            for (_, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
                 // Update the tenant's intent to remove all attachments
                 shard.policy = PlacementPolicy::Detached;
                 shard
@@ -2892,7 +2887,7 @@ impl Service {
             let node = nodes
                 .get(&node_id)
                 .expect("Pageservers may not be deleted while lock is active");
-            (detach_waiters, shard_ids, node.clone())
+            (detach_waiters, node.clone())
         };
 
         // This reconcile wait can fail in a few ways:
@@ -2907,38 +2902,34 @@ impl Service {
         self.await_waiters(detach_waiters, RECONCILE_TIMEOUT)
             .await?;
 
-        let locations = shard_ids
-            .into_iter()
-            .map(|s| (s, node.clone()))
-            .collect::<Vec<_>>();
-        let results = self.tenant_for_shards_api(
-            locations,
-            |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await },
-            1,
-            3,
-            RECONCILE_TIMEOUT,
-            &self.cancel,
-        )
-        .await;
-        for result in results {
-            match result {
-                Ok(StatusCode::ACCEPTED) => {
-                    // This should never happen: we waited for detaches to finish above
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                        "Unexpectedly still attached on {}",
-                        node
-                    )));
-                }
-                Ok(_) => {}
-                Err(mgmt_api::Error::Cancelled) => {
-                    return Err(ApiError::ShuttingDown);
-                }
-                Err(e) => {
-                    // This is unexpected: remote deletion should be infallible, unless the object store
-                    // at large is unavailable.
-                    tracing::error!("Error deleting via node {}: {e}", node);
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
-                }
+        // Delete the entire tenant (all shards) from remote storage via a random pageserver.
+        // Passing an unsharded tenant ID will cause the pageserver to remove all remote paths with
+        // the tenant ID prefix, including all shards (even possibly stale ones).
+        match node
+            .with_client_retries(
+                |client| async move {
+                    client
+                        .tenant_delete(TenantShardId::unsharded(tenant_id))
+                        .await
+                },
+                &self.config.jwt_token,
+                1,
+                3,
+                RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await
+            .unwrap_or(Err(mgmt_api::Error::Cancelled))
+        {
+            Ok(_) => {}
+            Err(mgmt_api::Error::Cancelled) => {
+                return Err(ApiError::ShuttingDown);
+            }
+            Err(e) => {
+                // This is unexpected: remote deletion should be infallible, unless the object store
+                // at large is unavailable.
+                tracing::error!("Error deleting via node {node}: {e}");
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
             }
         }
 
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 377a95fbeb..4c4306be9e 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -303,9 +303,10 @@ def assert_prefix_empty(
     remote_storage: Optional[RemoteStorage],
     prefix: Optional[str] = None,
     allowed_postfix: Optional[str] = None,
+    delimiter: str = "/",
 ) -> None:
     assert remote_storage is not None
-    response = list_prefix(remote_storage, prefix)
+    response = list_prefix(remote_storage, prefix, delimiter)
     keys = response["KeyCount"]
     objects: list[ObjectTypeDef] = response.get("Contents", [])
     common_prefixes = response.get("CommonPrefixes", [])
@@ -338,16 +339,18 @@ def assert_prefix_empty(
             if not (allowed_postfix.endswith(key)):
                 filtered_count += 1
 
-    assert (
-        filtered_count == 0
-    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    assert filtered_count == 0, f"remote prefix {prefix} is not empty: {objects}"
 
 
 # remote_storage must not be None, but that's easier for callers to make mypy happy
-def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None):
+def assert_prefix_not_empty(
+    remote_storage: Optional[RemoteStorage],
+    prefix: Optional[str] = None,
+    delimiter: str = "/",
+):
     assert remote_storage is not None
-    response = list_prefix(remote_storage, prefix)
-    assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
+    response = list_prefix(remote_storage, prefix, delimiter)
+    assert response["KeyCount"] != 0, f"remote prefix {prefix} is empty: {response}"
 
 
 def list_prefix(
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 294c1248c5..f486327445 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -20,6 +20,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
 from fixtures.utils import run_pg_bench_small, wait_until
+from fixtures.workload import Workload
 from requests.exceptions import ReadTimeout
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -404,3 +405,57 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder
         cloud_admin_api_token=cloud_admin_token,
     )
     assert healthy
+
+
+def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Deleting a tenant should also delete any stale (pre-split) shards from remote storage.
+    """
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start()
+
+    # Create an unsharded tenant.
+    tenant_id, timeline_id = env.create_tenant()
+
+    # Write some data.
+    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
+    workload.init()
+    workload.write_rows(256)
+    workload.validate()
+
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(("tenants", str(tenant_id))),
+    )
+
+    # Upload a heatmap as well.
+    env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
+
+    # Split off a few shards, in two rounds.
+    env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
+    env.storage_controller.tenant_shard_split(tenant_id, shard_count=16)
+
+    # Delete the tenant. This should also delete data for the unsharded and count=4 parents.
+    env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id)
+
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(("tenants", str(tenant_id))),
+        delimiter="",  # match partial prefixes, i.e. all shards
+    )
+
+    dirs = list(env.pageserver.tenant_dir(None).glob(f"{tenant_id}*"))
+    assert dirs == [], f"found tenant directories: {dirs}"
+
+    # The initial tenant created by the test harness should still be there.
+    # Only the tenant we deleted should be removed.
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(("tenants", str(env.initial_tenant))),
+    )
+    dirs = list(env.pageserver.tenant_dir(None).glob(f"{env.initial_tenant}*"))
+    assert dirs != [], "missing initial tenant directory"
+
+    env.stop()

From 299cde899b7b9a31723508afdf7b9e0f0be13912 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 17 Oct 2024 17:19:18 +0200
Subject: [PATCH 08/48] safekeeper: flush WAL on compute disconnect (#9436)

## Problem

In #9259, we found that the `check_safekeepers_synced` fast path could
result in a lower basebackup LSN than the `flush_lsn` reported by
Safekeepers in `VoteResponse`, causing the compute to panic once on
startup.

This would happen if the Safekeeper had unflushed WAL records due to a
compute disconnect. The `TIMELINE_STATUS` query would report a
`flush_lsn` below these unflushed records, while `VoteResponse` would
flush the WAL and report the advanced `flush_lsn`. See
https://github.com/neondatabase/neon/issues/9259#issuecomment-2410849032.

## Summary of changes

Flush the WAL if the compute disconnects during WAL processing.
---
 safekeeper/src/receive_wal.rs | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index e35f806e90..2a9ca85299 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -498,21 +498,18 @@ impl WalAcceptor {
         // we will send keepalives by replying to these requests once per second.
         let mut next_keepalive = Instant::now();
 
-        loop {
-            let opt_msg = self.msg_rx.recv().await;
-            if opt_msg.is_none() {
-                return Ok(()); // chan closed, streaming terminated
-            }
-            let mut next_msg = opt_msg.unwrap();
-
+        while let Some(mut next_msg) = self.msg_rx.recv().await {
             // Update walreceiver state in shmem for reporting.
             if let ProposerAcceptorMessage::Elected(_) = &next_msg {
                 walreceiver_guard.get().status = WalReceiverStatus::Streaming;
             }
 
             let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
-                // loop through AppendRequest's while it's readily available to
-                // write as many WAL as possible without fsyncing
+                // Loop through AppendRequests while available to write as many WAL records as
+                // possible without fsyncing.
+                //
+                // Make sure the WAL is flushed before returning, see:
+                // https://github.com/neondatabase/neon/issues/9259
                 //
                 // Note: this will need to be rewritten if we want to read non-AppendRequest messages here.
                 // Otherwise, we might end up in a situation where we read a message, but don't
@@ -522,7 +519,7 @@ impl WalAcceptor {
 
                     if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
                         if self.reply_tx.send(reply).await.is_err() {
-                            return Ok(()); // chan closed, streaming terminated
+                            break; // disconnected, flush WAL and return on next send/recv
                         }
                     }
 
@@ -531,11 +528,13 @@ impl WalAcceptor {
                         break;
                     }
 
+                    // continue pulling AppendRequests if available
                     match self.msg_rx.try_recv() {
                         Ok(msg) => next_msg = msg,
                         Err(TryRecvError::Empty) => break,
-                        Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated
-                    }
+                        // on disconnect, flush WAL and return on next send/recv
+                        Err(TryRecvError::Disconnected) => break,
+                    };
                 }
 
                 // flush all written WAL to the disk
@@ -555,5 +554,6 @@ impl WalAcceptor {
                 next_keepalive = Instant::now() + KEEPALIVE_INTERVAL;
             }
         }
+        Ok(())
     }
 }

From 858867c62771e7f24c3d33820a8ca87c5f4f146f Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 17 Oct 2024 16:35:19 +0100
Subject: [PATCH 09/48] Add logging of installed_extensions (#9438)

Simple PR to log `installed_extensions` statistics in the following format:
```
2024-10-17T13:53:02.860595Z  INFO [NEON_EXT_STAT] {"extensions":[{"extname":"plpgsql","versions":["1.0"],"n_databases":2},{"extname":"neon","versions":["1.5"],"n_databases":1}]}
```
---
 compute_tools/src/compute.rs              | 28 +++++------------------
 compute_tools/src/installed_extensions.rs | 21 +++++++++++++++++
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 285be56264..6aec008f3a 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -34,6 +34,7 @@ use nix::sys::signal::{kill, Signal};
 use remote_storage::{DownloadError, RemotePath};
 
 use crate::checker::create_availability_check_data;
+use crate::installed_extensions::get_installed_extensions_sync;
 use crate::local_proxy;
 use crate::logger::inlinify;
 use crate::pg_helpers::*;
@@ -1121,6 +1122,11 @@ impl ComputeNode {
                 self.pg_reload_conf()?;
             }
             self.post_apply_config()?;
+
+            let connstr = self.connstr.clone();
+            thread::spawn(move || {
+                get_installed_extensions_sync(connstr).context("get_installed_extensions")
+            });
         }
 
         let startup_end_time = Utc::now();
@@ -1484,28 +1490,6 @@ LIMIT 100",
             info!("Pageserver config changed");
         }
     }
-
-    // Gather info about installed extensions
-    pub fn get_installed_extensions(&self) -> Result<()> {
-        let connstr = self.connstr.clone();
-
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to create runtime");
-        let result = rt
-            .block_on(crate::installed_extensions::get_installed_extensions(
-                connstr,
-            ))
-            .expect("failed to get installed extensions");
-
-        info!(
-            "{}",
-            serde_json::to_string(&result).expect("failed to serialize extensions list")
-        );
-
-        Ok(())
-    }
 }
 
 pub fn forward_termination_signal() {
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index 72578b1f34..877f99bff7 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,6 +1,7 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use std::collections::HashMap;
 use std::collections::HashSet;
+use tracing::info;
 use url::Url;
 
 use anyhow::Result;
@@ -79,3 +80,23 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
     })
     .await?
 }
+
+// Gather info about installed extensions
+pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .expect("failed to create runtime");
+    let result = rt
+        .block_on(crate::installed_extensions::get_installed_extensions(
+            connstr,
+        ))
+        .expect("failed to get installed extensions");
+
+    info!(
+        "[NEON_EXT_STAT] {}",
+        serde_json::to_string(&result).expect("failed to serialize extensions list")
+    );
+
+    Ok(())
+}

From 63b3491c1b489487e9d94b8499f401cd57e12290 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 17 Oct 2024 12:22:44 -0400
Subject: [PATCH 10/48] refactor(pageserver): remove aux v1 code path (#9424)

Part of the aux v1 retirement
https://github.com/neondatabase/neon/issues/8623

## Summary of changes

Remove the write/read paths for aux v1, but keep the config item and the
index part field for now.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs             |   2 -
 pageserver/src/http/routes.rs                 |  32 --
 pageserver/src/pgdatadir_mapping.rs           | 323 +++------------
 pageserver/src/tenant.rs                      | 380 +-----------------
 .../src/tenant/remote_timeline_client.rs      |  14 +-
 .../tenant/remote_timeline_client/index.rs    |   4 -
 pageserver/src/tenant/timeline.rs             |  51 +--
 pageserver/src/tenant/timeline/delete.rs      |   2 -
 pageserver/src/walredo/apply_neon.rs          |  71 +---
 test_runner/regress/test_aux_files.py         |  78 ----
 10 files changed, 60 insertions(+), 897 deletions(-)
 delete mode 100644 test_runner/regress/test_aux_files.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 3ec9cac2c3..5b0b6bebe3 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -743,8 +743,6 @@ pub struct TimelineInfo {
     // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
     // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
     // read.
-    /// The last aux file policy being used on this timeline
-    pub last_aux_file_policy: Option<AuxFilePolicy>,
     pub is_archived: Option<bool>,
 }
 
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 36a6ed427b..e6663ef56f 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -18,7 +18,6 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::virtual_file::IoMode;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
@@ -474,8 +473,6 @@ async fn build_timeline_info_common(
         is_archived: Some(is_archived),
 
         walreceiver_status,
-
-        last_aux_file_policy: timeline.last_aux_file_policy.load(),
     };
     Ok(info)
 }
@@ -2399,31 +2396,6 @@ async fn post_tracing_event_handler(
     json_response(StatusCode::OK, ())
 }
 
-async fn force_aux_policy_switch_handler(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&r, None)?;
-    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
-    let policy: AuxFilePolicy = json_request(&mut r).await?;
-
-    let state = get_state(&r);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-    timeline
-        .do_switch_aux_policy(policy)
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn put_io_engine_handler(
     mut r: Request<Body>,
     _cancel: CancellationToken,
@@ -3136,10 +3108,6 @@ pub fn make_router(
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
-        .put(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
-            |r| api_handler(r, force_aux_policy_switch_handler),
-        )
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 900da5beab..f2a11e65c1 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -22,7 +22,6 @@ use pageserver_api::key::{
     CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -33,7 +32,7 @@ use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, info, trace, warn};
+use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -677,21 +676,6 @@ impl Timeline {
         self.get(CHECKPOINT_KEY, lsn, ctx).await
     }
 
-    async fn list_aux_files_v1(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        match self.get(AUX_FILES_KEY, lsn, ctx).await {
-            Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
-            Err(e) => {
-                // This is expected: historical databases do not have the key.
-                debug!("Failed to get info about AUX files: {}", e);
-                Ok(HashMap::new())
-            }
-        }
-    }
-
     async fn list_aux_files_v2(
         &self,
         lsn: Lsn,
@@ -722,10 +706,7 @@ impl Timeline {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<(), PageReconstructError> {
-        let current_policy = self.last_aux_file_policy.load();
-        if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
-            self.list_aux_files_v2(lsn, ctx).await?;
-        }
+        self.list_aux_files_v2(lsn, ctx).await?;
         Ok(())
     }
 
@@ -734,51 +715,7 @@ impl Timeline {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        let current_policy = self.last_aux_file_policy.load();
-        match current_policy {
-            Some(AuxFilePolicy::V1) => {
-                let res = self.list_aux_files_v1(lsn, ctx).await?;
-                let empty_str = if res.is_empty() { ", empty" } else { "" };
-                warn!(
-                    "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
-                );
-                Ok(res)
-            }
-            None => {
-                let res = self.list_aux_files_v1(lsn, ctx).await?;
-                if !res.is_empty() {
-                    warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
-                }
-                Ok(res)
-            }
-            Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
-            Some(AuxFilePolicy::CrossValidation) => {
-                let v1_result = self.list_aux_files_v1(lsn, ctx).await;
-                let v2_result = self.list_aux_files_v2(lsn, ctx).await;
-                match (v1_result, v2_result) {
-                    (Ok(v1), Ok(v2)) => {
-                        if v1 != v2 {
-                            tracing::error!(
-                                "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
-                            );
-                            return Err(PageReconstructError::Other(anyhow::anyhow!(
-                                "unmatched aux file v1 v2 result"
-                            )));
-                        }
-                        Ok(v1)
-                    }
-                    (Ok(_), Err(v2)) => {
-                        tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
-                        Err(v2)
-                    }
-                    (Err(v1), Ok(_)) => {
-                        tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
-                        Err(v1)
-                    }
-                    (Err(_), Err(v2)) => Err(v2),
-                }
-            }
-        }
+        self.list_aux_files_v2(lsn, ctx).await
     }
 
     pub(crate) async fn get_replorigins(
@@ -954,9 +891,6 @@ impl Timeline {
 
         result.add_key(CONTROLFILE_KEY);
         result.add_key(CHECKPOINT_KEY);
-        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
-            result.add_key(AUX_FILES_KEY);
-        }
 
         // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
         // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
@@ -1166,9 +1100,6 @@ impl<'a> DatadirModification<'a> {
         self.pending_directory_entries.push((DirectoryKind::Db, 0));
         self.put(DBDIR_KEY, Value::Image(buf.into()));
 
-        // Create AuxFilesDirectory
-        self.init_aux_dir()?;
-
         let buf = if self.tline.pg_version >= 17 {
             TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
                 xids: HashSet::new(),
@@ -1347,9 +1278,6 @@ impl<'a> DatadirModification<'a> {
             // 'true', now write the updated 'dbdirs' map back.
             let buf = DbDirectory::ser(&dbdir)?;
             self.put(DBDIR_KEY, Value::Image(buf.into()));
-
-            // Create AuxFilesDirectory as well
-            self.init_aux_dir()?;
         }
         if r.is_none() {
             // Create RelDirectory
@@ -1726,200 +1654,60 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
-    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
-        if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
-            return Ok(());
-        }
-        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-            files: HashMap::new(),
-        })?;
-        self.pending_directory_entries
-            .push((DirectoryKind::AuxFiles, 0));
-        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
-        Ok(())
-    }
-
     pub async fn put_file(
         &mut self,
         path: &str,
         content: &[u8],
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let switch_policy = self.tline.get_switch_aux_file_policy();
-
-        let policy = {
-            let current_policy = self.tline.last_aux_file_policy.load();
-            // Allowed switch path:
-            // * no aux files -> v1/v2/cross-validation
-            // * cross-validation->v2
-
-            let current_policy = if current_policy.is_none() {
-                // This path will only be hit once per tenant: we will decide the final policy in this code block.
-                // The next call to `put_file` will always have `last_aux_file_policy != None`.
-                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
-                if aux_files_key_v1.is_empty() {
-                    None
-                } else {
-                    warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
-                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
-                    Some(AuxFilePolicy::V1)
-                }
-            } else {
-                current_policy
-            };
-
-            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
-                self.tline.do_switch_aux_policy(switch_policy)?;
-                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
-                switch_policy
-            } else {
-                // This branch handles non-valid migration path, and the case that switch_policy == current_policy.
-                // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
-                current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
-            }
+        let key = aux_file::encode_aux_file_key(path);
+        // retrieve the key from the engine
+        let old_val = match self.get(key, ctx).await {
+            Ok(val) => Some(val),
+            Err(PageReconstructError::MissingKey(_)) => None,
+            Err(e) => return Err(e.into()),
         };
-
-        if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
-            let key = aux_file::encode_aux_file_key(path);
-            // retrieve the key from the engine
-            let old_val = match self.get(key, ctx).await {
-                Ok(val) => Some(val),
-                Err(PageReconstructError::MissingKey(_)) => None,
-                Err(e) => return Err(e.into()),
-            };
-            let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-                aux_file::decode_file_value(old_val)?
+        let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
+            aux_file::decode_file_value(old_val)?
+        } else {
+            Vec::new()
+        };
+        let mut other_files = Vec::with_capacity(files.len());
+        let mut modifying_file = None;
+        for file @ (p, content) in files {
+            if path == p {
+                assert!(
+                    modifying_file.is_none(),
+                    "duplicated entries found for {}",
+                    path
+                );
+                modifying_file = Some(content);
             } else {
-                Vec::new()
-            };
-            let mut other_files = Vec::with_capacity(files.len());
-            let mut modifying_file = None;
-            for file @ (p, content) in files {
-                if path == p {
-                    assert!(
-                        modifying_file.is_none(),
-                        "duplicated entries found for {}",
-                        path
-                    );
-                    modifying_file = Some(content);
-                } else {
-                    other_files.push(file);
-                }
+                other_files.push(file);
             }
-            let mut new_files = other_files;
-            match (modifying_file, content.is_empty()) {
-                (Some(old_content), false) => {
-                    self.tline
-                        .aux_file_size_estimator
-                        .on_update(old_content.len(), content.len());
-                    new_files.push((path, content));
-                }
-                (Some(old_content), true) => {
-                    self.tline
-                        .aux_file_size_estimator
-                        .on_remove(old_content.len());
-                    // not adding the file key to the final `new_files` vec.
-                }
-                (None, false) => {
-                    self.tline.aux_file_size_estimator.on_add(content.len());
-                    new_files.push((path, content));
-                }
-                (None, true) => warn!("removing non-existing aux file: {}", path),
-            }
-            let new_val = aux_file::encode_file_value(&new_files)?;
-            self.put(key, Value::Image(new_val.into()));
         }
-
-        if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
-            let file_path = path.to_string();
-            let content = if content.is_empty() {
-                None
-            } else {
-                Some(Bytes::copy_from_slice(content))
-            };
-
-            let n_files;
-            let mut aux_files = self.tline.aux_files.lock().await;
-            if let Some(mut dir) = aux_files.dir.take() {
-                // We already updated aux files in `self`: emit a delta and update our latest value.
-                dir.upsert(file_path.clone(), content.clone());
-                n_files = dir.files.len();
-                if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
-                    self.put(
-                        AUX_FILES_KEY,
-                        Value::Image(Bytes::from(
-                            AuxFilesDirectory::ser(&dir).context("serialize")?,
-                        )),
-                    );
-                    aux_files.n_deltas = 0;
-                } else {
-                    self.put(
-                        AUX_FILES_KEY,
-                        Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
-                    );
-                    aux_files.n_deltas += 1;
-                }
-                aux_files.dir = Some(dir);
-            } else {
-                // Check if the AUX_FILES_KEY is initialized
-                match self.get(AUX_FILES_KEY, ctx).await {
-                    Ok(dir_bytes) => {
-                        let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
-                        // Key is already set, we may append a delta
-                        self.put(
-                            AUX_FILES_KEY,
-                            Value::WalRecord(NeonWalRecord::AuxFile {
-                                file_path: file_path.clone(),
-                                content: content.clone(),
-                            }),
-                        );
-                        dir.upsert(file_path, content);
-                        n_files = dir.files.len();
-                        aux_files.dir = Some(dir);
-                    }
-                    Err(
-                        e @ (PageReconstructError::Cancelled
-                        | PageReconstructError::AncestorLsnTimeout(_)),
-                    ) => {
-                        // Important that we do not interpret a shutdown error as "not found" and thereby
-                        // reset the map.
-                        return Err(e.into());
-                    }
-                    // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
-                    // the original code assumes all other errors are missing keys. Therefore, we keep the code path
-                    // the same for now, though in theory, we should only match the `MissingKey` variant.
-                    Err(
-                        e @ (PageReconstructError::Other(_)
-                        | PageReconstructError::WalRedo(_)
-                        | PageReconstructError::MissingKey(_)),
-                    ) => {
-                        // Key is missing, we must insert an image as the basis for subsequent deltas.
-
-                        if !matches!(e, PageReconstructError::MissingKey(_)) {
-                            let e = utils::error::report_compact_sources(&e);
-                            tracing::warn!("treating error as if it was a missing key: {}", e);
-                        }
-
-                        let mut dir = AuxFilesDirectory {
-                            files: HashMap::new(),
-                        };
-                        dir.upsert(file_path, content);
-                        self.put(
-                            AUX_FILES_KEY,
-                            Value::Image(Bytes::from(
-                                AuxFilesDirectory::ser(&dir).context("serialize")?,
-                            )),
-                        );
-                        n_files = 1;
-                        aux_files.dir = Some(dir);
-                    }
-                }
+        let mut new_files = other_files;
+        match (modifying_file, content.is_empty()) {
+            (Some(old_content), false) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_update(old_content.len(), content.len());
+                new_files.push((path, content));
             }
-
-            self.pending_directory_entries
-                .push((DirectoryKind::AuxFiles, n_files));
+            (Some(old_content), true) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_remove(old_content.len());
+                // not adding the file key to the final `new_files` vec.
+            }
+            (None, false) => {
+                self.tline.aux_file_size_estimator.on_add(content.len());
+                new_files.push((path, content));
+            }
+            (None, true) => warn!("removing non-existing aux file: {}", path),
         }
+        let new_val = aux_file::encode_file_value(&new_files)?;
+        self.put(key, Value::Image(new_val.into()));
 
         Ok(())
     }
@@ -2089,12 +1877,6 @@ impl<'a> DatadirModification<'a> {
         self.tline.get(key, lsn, ctx).await
     }
 
-    /// Only used during unit tests, force putting a key into the modification.
-    #[cfg(test)]
-    pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
-        self.put(key, val);
-    }
-
     fn put(&mut self, key: Key, val: Value) {
         if Self::is_data_key(&key) {
             self.put_data(key.to_compact(), val)
@@ -2212,21 +1994,6 @@ struct RelDirectory {
     rels: HashSet<(Oid, u8)>,
 }
 
-#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
-pub(crate) struct AuxFilesDirectory {
-    pub(crate) files: HashMap<String, Bytes>,
-}
-
-impl AuxFilesDirectory {
-    pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
-        if let Some(value) = value {
-            self.files.insert(key, value);
-        } else {
-            self.files.remove(&key);
-        }
-    }
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 struct RelSizeEntry {
     nblocks: u32,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index baa2365658..1066d165cd 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,7 +20,6 @@ use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use pageserver_api::models;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
@@ -800,7 +799,6 @@ impl Tenant {
         index_part: Option<IndexPart>,
         metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
-        last_aux_file_policy: Option<AuxFilePolicy>,
         _ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let tenant_id = self.tenant_shard_id;
@@ -811,10 +809,6 @@ impl Tenant {
             ancestor.clone(),
             resources,
             CreateTimelineCause::Load,
-            // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
-            // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
-            // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
-            last_aux_file_policy,
         )?;
         let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
         anyhow::ensure!(
@@ -829,10 +823,6 @@ impl Tenant {
 
         if let Some(index_part) = index_part.as_ref() {
             timeline.remote_client.init_upload_queue(index_part)?;
-
-            timeline
-                .last_aux_file_policy
-                .store(index_part.last_aux_file_policy());
         } else {
             // No data on the remote storage, but we have local metadata file. We can end up
             // here with timeline_create being interrupted before finishing index part upload.
@@ -1403,15 +1393,12 @@ impl Tenant {
             None
         };
 
-        let last_aux_file_policy = index_part.last_aux_file_policy();
-
         self.timeline_init_and_sync(
             timeline_id,
             resources,
             Some(index_part),
             remote_metadata,
             ancestor,
-            last_aux_file_policy,
             ctx,
         )
         .await
@@ -1824,7 +1811,6 @@ impl Tenant {
             create_guard,
             initdb_lsn,
             None,
-            None,
         )
         .await
     }
@@ -3032,7 +3018,6 @@ impl Tenant {
         ancestor: Option<Arc<Timeline>>,
         resources: TimelineResources,
         cause: CreateTimelineCause,
-        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> anyhow::Result<Arc<Timeline>> {
         let state = match cause {
             CreateTimelineCause::Load => {
@@ -3061,7 +3046,6 @@ impl Tenant {
             resources,
             pg_version,
             state,
-            last_aux_file_policy,
             self.attach_wal_lag_cooldown.clone(),
             self.cancel.child_token(),
         );
@@ -3720,7 +3704,6 @@ impl Tenant {
                 timeline_create_guard,
                 start_lsn + 1,
                 Some(Arc::clone(src_timeline)),
-                src_timeline.last_aux_file_policy.load(),
             )
             .await?;
 
@@ -3914,7 +3897,6 @@ impl Tenant {
                 timeline_create_guard,
                 pgdata_lsn,
                 None,
-                None,
             )
             .await?;
 
@@ -3986,7 +3968,6 @@ impl Tenant {
         create_guard: TimelineCreateGuard<'a>,
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
-        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> anyhow::Result<UninitializedTimeline<'a>> {
         let tenant_shard_id = self.tenant_shard_id;
 
@@ -4002,7 +3983,6 @@ impl Tenant {
                 ancestor,
                 resources,
                 CreateTimelineCause::Load,
-                last_aux_file_policy,
             )
             .context("Failed to create timeline data structure")?;
 
@@ -4600,7 +4580,6 @@ mod tests {
 
     use super::*;
     use crate::keyspace::KeySpaceAccum;
-    use crate::pgdatadir_mapping::AuxFilesDirectory;
     use crate::repository::{Key, Value};
     use crate::tenant::harness::*;
     use crate::tenant::timeline::CompactFlags;
@@ -4609,7 +4588,7 @@ mod tests {
     use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
     use itertools::Itertools;
-    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
     use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     use rand::{thread_rng, Rng};
@@ -4618,7 +4597,6 @@ mod tests {
     use tests::timeline::{GetVectoredError, ShutdownMode};
     use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
     use timeline::{DeltaLayerTestDesc, GcInfo};
-    use utils::bin_ser::BeSer;
     use utils::id::TenantId;
 
     static TEST_KEY: Lazy<Key> =
@@ -6422,16 +6400,9 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_branch_copies_dirty_aux_file_flag() {
-        let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
-            .await
-            .unwrap();
+    async fn test_aux_file_e2e() {
+        let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap();
 
-        // the default aux file policy to switch is v2 if not set by the admins
-        assert_eq!(
-            harness.tenant_conf.switch_aux_file_policy,
-            AuxFilePolicy::default_tenant_config()
-        );
         let (tenant, ctx) = harness.load().await;
 
         let mut lsn = Lsn(0x08);
@@ -6441,9 +6412,6 @@ mod tests {
             .await
             .unwrap();
 
-        // no aux file is written at this point, so the persistent flag should be unset
-        assert_eq!(tline.last_aux_file_policy.load(), None);
-
         {
             lsn += 8;
             let mut modification = tline.begin_modification(lsn);
@@ -6454,30 +6422,6 @@ mod tests {
             modification.commit(&ctx).await.unwrap();
         }
 
-        // there is no tenant manager to pass the configuration through, so lets mimic it
-        tenant.set_new_location_config(
-            AttachedTenantConf::try_from(LocationConf::attached_single(
-                TenantConfOpt {
-                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
-                    ..Default::default()
-                },
-                tenant.generation,
-                &pageserver_api::models::ShardParameters::default(),
-            ))
-            .unwrap(),
-        );
-
-        assert_eq!(
-            tline.get_switch_aux_file_policy(),
-            AuxFilePolicy::V2,
-            "wanted state has been updated"
-        );
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
-        );
-
         // we can read everything from the storage
         let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
         assert_eq!(
@@ -6495,12 +6439,6 @@ mod tests {
             modification.commit(&ctx).await.unwrap();
         }
 
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "keep v2 storage format when new files are written"
-        );
-
         let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
         assert_eq!(
             files.get("pg_logical/mappings/test2"),
@@ -6512,321 +6450,9 @@ mod tests {
             .await
             .unwrap();
 
-        // child copies the last flag even if that is not on remote storage yet
-        assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
-        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
-
         let files = child.list_aux_files(lsn, &ctx).await.unwrap();
         assert_eq!(files.get("pg_logical/mappings/test1"), None);
         assert_eq!(files.get("pg_logical/mappings/test2"), None);
-
-        // even if we crash here without flushing parent timeline with it's new
-        // last_aux_file_policy we are safe, because child was never meant to access ancestor's
-        // files. the ancestor can even switch back to V1 because of a migration safely.
-    }
-
-    #[tokio::test]
-    async fn aux_file_policy_switch() {
-        let mut harness = TenantHarness::create("aux_file_policy_switch")
-            .await
-            .unwrap();
-        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
-        let (tenant, ctx) = harness.load().await;
-
-        let mut lsn = Lsn(0x08);
-
-        let tline: Arc<Timeline> = tenant
-            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            None,
-            "no aux file is written so it should be unset"
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test1", b"first", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        // there is no tenant manager to pass the configuration through, so lets mimic it
-        tenant.set_new_location_config(
-            AttachedTenantConf::try_from(LocationConf::attached_single(
-                TenantConfOpt {
-                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
-                    ..Default::default()
-                },
-                tenant.generation,
-                &pageserver_api::models::ShardParameters::default(),
-            ))
-            .unwrap(),
-        );
-
-        assert_eq!(
-            tline.get_switch_aux_file_policy(),
-            AuxFilePolicy::V2,
-            "wanted state has been updated"
-        );
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::CrossValidation),
-            "dirty index_part.json reflected state is yet to be updated"
-        );
-
-        // we can still read the auxfile v1 before we ingest anything new
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test1"),
-            Some(&bytes::Bytes::from_static(b"first"))
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test2", b"second", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "ingesting a file should apply the wanted switch state when applicable"
-        );
-
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test1"),
-            Some(&bytes::Bytes::from_static(b"first")),
-            "cross validation writes to both v1 and v2 so this should be available in v2"
-        );
-        assert_eq!(
-            files.get("pg_logical/mappings/test2"),
-            Some(&bytes::Bytes::from_static(b"second"))
-        );
-
-        // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
-        tenant.set_new_location_config(
-            AttachedTenantConf::try_from(LocationConf::attached_single(
-                TenantConfOpt {
-                    switch_aux_file_policy: Some(AuxFilePolicy::V1),
-                    ..Default::default()
-                },
-                tenant.generation,
-                &pageserver_api::models::ShardParameters::default(),
-            ))
-            .unwrap(),
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test2", b"third", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        assert_eq!(
-            tline.get_switch_aux_file_policy(),
-            AuxFilePolicy::V1,
-            "wanted state has been updated again, even if invalid request"
-        );
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "ingesting a file should apply the wanted switch state when applicable"
-        );
-
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test1"),
-            Some(&bytes::Bytes::from_static(b"first"))
-        );
-        assert_eq!(
-            files.get("pg_logical/mappings/test2"),
-            Some(&bytes::Bytes::from_static(b"third"))
-        );
-
-        // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file)
-        tenant.set_new_location_config(
-            AttachedTenantConf::try_from(LocationConf::attached_single(
-                TenantConfOpt {
-                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
-                    ..Default::default()
-                },
-                tenant.generation,
-                &pageserver_api::models::ShardParameters::default(),
-            ))
-            .unwrap(),
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test3", b"last", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
-
-        assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
-
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test1"),
-            Some(&bytes::Bytes::from_static(b"first"))
-        );
-        assert_eq!(
-            files.get("pg_logical/mappings/test2"),
-            Some(&bytes::Bytes::from_static(b"third"))
-        );
-        assert_eq!(
-            files.get("pg_logical/mappings/test3"),
-            Some(&bytes::Bytes::from_static(b"last"))
-        );
-    }
-
-    #[tokio::test]
-    async fn aux_file_policy_force_switch() {
-        let mut harness = TenantHarness::create("aux_file_policy_force_switch")
-            .await
-            .unwrap();
-        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
-        let (tenant, ctx) = harness.load().await;
-
-        let mut lsn = Lsn(0x08);
-
-        let tline: Arc<Timeline> = tenant
-            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            None,
-            "no aux file is written so it should be unset"
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test1", b"first", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "dirty index_part.json reflected state is yet to be updated"
-        );
-
-        // lose all data from v1
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(files.get("pg_logical/mappings/test1"), None);
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test2", b"second", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        // read data ingested in v2
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test2"),
-            Some(&bytes::Bytes::from_static(b"second"))
-        );
-        // lose all data from v1
-        assert_eq!(files.get("pg_logical/mappings/test1"), None);
-    }
-
-    #[tokio::test]
-    async fn aux_file_policy_auto_detect() {
-        let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
-            .await
-            .unwrap();
-        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
-        let (tenant, ctx) = harness.load().await;
-
-        let mut lsn = Lsn(0x08);
-
-        let tline: Arc<Timeline> = tenant
-            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            None,
-            "no aux file is written so it should be unset"
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-                files: vec![(
-                    "test_file".to_string(),
-                    Bytes::copy_from_slice(b"test_file"),
-                )]
-                .into_iter()
-                .collect(),
-            })
-            .unwrap();
-            modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test1", b"first", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V1),
-            "keep using v1 because there are aux files writting with v1"
-        );
-
-        // we can still read the auxfile v1
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test1"),
-            Some(&bytes::Bytes::from_static(b"first"))
-        );
-        assert_eq!(
-            files.get("test_file"),
-            Some(&bytes::Bytes::from_static(b"test_file"))
-        );
     }
 
     #[tokio::test]
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1f9ae40af5..5e9702bd3d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -187,7 +187,7 @@ use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
 
 pub(crate) use download::download_initdb_tar_zst;
-use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
+use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -628,18 +628,6 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
-    /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
-    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
-        self: &Arc<Self>,
-        last_aux_file_policy: Option<AuxFilePolicy>,
-    ) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-        upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
-        self.schedule_index_upload(upload_queue)?;
-        Ok(())
-    }
-
     /// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
     ///
     /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index c51ff54919..3a74a4ed11 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -133,10 +133,6 @@ impl IndexPart {
     pub(crate) fn example() -> Self {
         Self::empty(TimelineMetadata::example())
     }
-
-    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
-        self.last_aux_file_policy
-    }
 }
 
 /// Metadata gathered for each of the layer files.
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2b4f949c76..d67a139dfa 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -28,9 +28,9 @@ use pageserver_api::{
     },
     keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
-        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
-        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-        InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
+        CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo,
+        DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
+        LsnLease, TimelineState,
     },
     reltag::BlockNumber,
     shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -98,12 +98,12 @@ use crate::{
 use crate::{
     metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
-use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
-use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
 use crate::{
-    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
+    pgdatadir_mapping::DirectoryKind,
     virtual_file::{MaybeFatalIo, VirtualFile},
 };
+use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
 
 use crate::config::PageServerConf;
@@ -206,11 +206,6 @@ pub struct TimelineResources {
     pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }
 
-pub(crate) struct AuxFilesState {
-    pub(crate) dir: Option<AuxFilesDirectory>,
-    pub(crate) n_deltas: usize,
-}
-
 /// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
 /// ingestion considerably, because WAL ingestion needs to check on most records if the record
 /// implicitly extends the relation.  At startup, `complete_as_of` is initialized to the current end
@@ -413,15 +408,9 @@ pub struct Timeline {
     timeline_get_throttle:
         Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
 
-    /// Keep aux directory cache to avoid it's reconstruction on each update
-    pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
-
     /// Size estimator for aux file v2
     pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
 
-    /// Indicate whether aux file v2 storage is enabled.
-    pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
-
     /// Some test cases directly place keys into the timeline without actually modifying the directory
     /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
     /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
@@ -2012,14 +2001,6 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
     }
 
-    pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
-        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .switch_aux_file_policy
-            .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy)
-    }
-
     pub(crate) fn get_lazy_slru_download(&self) -> bool {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
@@ -2152,7 +2133,6 @@ impl Timeline {
         resources: TimelineResources,
         pg_version: u32,
         state: TimelineState,
-        aux_file_policy: Option<AuxFilePolicy>,
         attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
         cancel: CancellationToken,
     ) -> Arc<Self> {
@@ -2282,15 +2262,8 @@ impl Timeline {
 
                 timeline_get_throttle: resources.timeline_get_throttle,
 
-                aux_files: tokio::sync::Mutex::new(AuxFilesState {
-                    dir: None,
-                    n_deltas: 0,
-                }),
-
                 aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
 
-                last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
-
                 #[cfg(test)]
                 extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
 
@@ -2301,10 +2274,6 @@ impl Timeline {
                 attach_wal_lag_cooldown,
             };
 
-            if aux_file_policy == Some(AuxFilePolicy::V1) {
-                warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)");
-            }
-
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
 
@@ -4479,14 +4448,6 @@ impl Timeline {
     ) -> Result<(), detach_ancestor::Error> {
         detach_ancestor::complete(self, tenant, attempt, ctx).await
     }
-
-    /// Switch aux file policy and schedule upload to the index part.
-    pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
-        self.last_aux_file_policy.store(Some(policy));
-        self.remote_client
-            .schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
-        Ok(())
-    }
 }
 
 impl Drop for Timeline {
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 305c5758cc..71b9e4e288 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -283,8 +283,6 @@ impl DeleteTimelineFlow {
                 // Important. We dont pass ancestor above because it can be missing.
                 // Thus we need to skip the validation here.
                 CreateTimelineCause::Delete,
-                // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
-                None,
             )
             .context("create_timeline_struct")?;
 
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
index facf01004c..c067787f97 100644
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -1,8 +1,7 @@
-use crate::pgdatadir_mapping::AuxFilesDirectory;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
-use bytes::{BufMut, BytesMut};
+use bytes::BytesMut;
 use pageserver_api::key::Key;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
@@ -13,7 +12,6 @@ use postgres_ffi::v14::nonrelfile_utils::{
 };
 use postgres_ffi::BLCKSZ;
 use tracing::*;
-use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;
 
 /// Can this request be served by neon redo functions
@@ -236,13 +234,9 @@ pub(crate) fn apply_in_neon(
                 LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
             }
         }
-        NeonWalRecord::AuxFile { file_path, content } => {
-            let mut dir = AuxFilesDirectory::des(page)?;
-            dir.upsert(file_path.clone(), content.clone());
-
-            page.clear();
-            let mut writer = page.writer();
-            dir.ser_into(&mut writer)?;
+        NeonWalRecord::AuxFile { .. } => {
+            // No-op: this record will never be created in aux v2.
+            warn!("AuxFile record should not be created in aux v2");
         }
         #[cfg(test)]
         NeonWalRecord::Test {
@@ -250,6 +244,7 @@ pub(crate) fn apply_in_neon(
             clear,
             will_init,
         } => {
+            use bytes::BufMut;
             if *will_init {
                 assert!(*clear, "init record must be clear to ensure correctness");
             }
@@ -261,59 +256,3 @@ pub(crate) fn apply_in_neon(
     }
     Ok(())
 }
-
-#[cfg(test)]
-mod test {
-    use bytes::Bytes;
-    use pageserver_api::key::AUX_FILES_KEY;
-
-    use super::*;
-    use std::collections::HashMap;
-
-    /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
-    #[test]
-    fn apply_aux_file_deltas() -> anyhow::Result<()> {
-        let base_dir = AuxFilesDirectory {
-            files: HashMap::from([
-                ("two".to_string(), Bytes::from_static(b"content0")),
-                ("three".to_string(), Bytes::from_static(b"contentX")),
-            ]),
-        };
-        let base_image = AuxFilesDirectory::ser(&base_dir)?;
-
-        let deltas = vec![
-            // Insert
-            NeonWalRecord::AuxFile {
-                file_path: "one".to_string(),
-                content: Some(Bytes::from_static(b"content1")),
-            },
-            // Update
-            NeonWalRecord::AuxFile {
-                file_path: "two".to_string(),
-                content: Some(Bytes::from_static(b"content99")),
-            },
-            // Delete
-            NeonWalRecord::AuxFile {
-                file_path: "three".to_string(),
-                content: None,
-            },
-        ];
-
-        let file_path = AUX_FILES_KEY;
-        let mut page = BytesMut::from_iter(base_image);
-
-        for record in deltas {
-            apply_in_neon(&record, Lsn(8), file_path, &mut page)?;
-        }
-
-        let reconstructed = AuxFilesDirectory::des(&page)?;
-        let expect = HashMap::from([
-            ("one".to_string(), Bytes::from_static(b"content1")),
-            ("two".to_string(), Bytes::from_static(b"content99")),
-        ]);
-
-        assert_eq!(reconstructed.files, expect);
-
-        Ok(())
-    }
-}
diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py
deleted file mode 100644
index 91d674d0db..0000000000
--- a/test_runner/regress/test_aux_files.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from __future__ import annotations
-
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    AuxFileStore,
-    NeonEnvBuilder,
-    logical_replication_sync,
-)
-
-
-def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
-    env = neon_env_builder.init_start()
-    endpoint = env.endpoints.create_start("main")
-    client = env.pageserver.http_client()
-
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    tenant_config = client.tenant_config(tenant_id).effective_config
-    tenant_config["switch_aux_file_policy"] = AuxFileStore.V2
-    client.set_tenant_config(tenant_id, tenant_config)
-    # aux file v2 is enabled on the write path, so for now, it should be unset (or null)
-    assert (
-        client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"]
-        is None
-    )
-
-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
-
-    cur.execute("create table t(pk integer primary key, payload integer)")
-    cur.execute(
-        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
-    )
-    cur.execute("create publication pub1 for table t, replication_example")
-
-    # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils)
-    # instead of going through the full logical replication process.
-    vanilla_pg.start()
-    vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
-    vanilla_pg.safe_psql(
-        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
-    )
-    connstr = endpoint.connstr().replace("'", "''")
-    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
-    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
-
-    # Wait logical replication channel to be established
-    logical_replication_sync(vanilla_pg, endpoint)
-    vanilla_pg.stop()
-    endpoint.stop()
-
-    with env.pageserver.http_client() as client:
-        # aux file v2 flag should be enabled at this point
-        assert (
-            client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
-            == AuxFileStore.V2
-        )
-    with env.pageserver.http_client() as client:
-        tenant_config = client.tenant_config(tenant_id).effective_config
-        tenant_config["switch_aux_file_policy"] = "V1"
-        client.set_tenant_config(tenant_id, tenant_config)
-        # the flag should still be enabled
-        assert (
-            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
-                "last_aux_file_policy"
-            ]
-            == AuxFileStore.V2
-        )
-    env.pageserver.restart()
-    with env.pageserver.http_client() as client:
-        # aux file v2 flag should be persisted
-        assert (
-            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
-                "last_aux_file_policy"
-            ]
-            == AuxFileStore.V2
-        )

From 24398bf0600223fb74fb3aa33ca4e4374209f84d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 17 Oct 2024 19:02:24 +0100
Subject: [PATCH 11/48] pageserver: detect & warn on loading an old index which
 is probably the result of a bad generation (#9383)

## Problem

The pageserver generally trusts the storage controller/control plane to
give it valid generations. However, sometimes it should be obvious that
a generation is bad, and for defense in depth we should detect that on
the pageserver.

This PR is part 1 of 2:
1. in this PR we detect and warn on such situations, but do not block
starting up the tenant.
2. once we have confidence that the check is not firing unexpectedly in
the field, part 2 of 2 will introduce a condition that refuses to start a
tenant in this situation, and a test for that (maybe, if we can figure out
how to spoof an ancient mtime)

Related: #6951

## Summary of changes

- When loading an index older than 2 weeks, log an INFO message noting
that we will check for other indices
- When loading an index older than 2 weeks _and_ a newer-generation
index exists, log a warning.
---
 pageserver/src/http/routes.rs                 |  2 +-
 .../src/tenant/remote_timeline_client.rs      | 45 ++++++++++++++++++-
 .../tenant/remote_timeline_client/download.rs | 11 ++---
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index e6663ef56f..8f928fd81b 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2251,7 +2251,7 @@ async fn tenant_scan_remote_handler(
                          %timeline_id))
             .await
             {
-                Ok((index_part, index_generation)) => {
+                Ok((index_part, index_generation, _index_mtime)) => {
                     tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
                         index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
                     generation = std::cmp::max(generation, index_generation);
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 5e9702bd3d..450084aca2 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -505,7 +505,7 @@ impl RemoteTimelineClient {
             },
         );
 
-        let (index_part, _index_generation) = download::download_index_part(
+        let (index_part, index_generation, index_last_modified) = download::download_index_part(
             &self.storage_impl,
             &self.tenant_shard_id,
             &self.timeline_id,
@@ -519,6 +519,49 @@ impl RemoteTimelineClient {
         )
         .await?;
 
+        // Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very
+        // old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g.
+        // in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that
+        // when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is
+        // also a newer index available, that is surprising.
+        const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600);
+        let index_age = index_last_modified.elapsed().unwrap_or_else(|e| {
+            if e.duration() > Duration::from_secs(5) {
+                // We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution
+                // timestamp, it is common to be out by at least 1 second.
+                tracing::warn!("Index has modification time in the future: {e}");
+            }
+            Duration::ZERO
+        });
+        if index_age > INDEX_AGE_CHECKS_THRESHOLD {
+            tracing::info!(
+                ?index_generation,
+                age = index_age.as_secs_f64(),
+                "Loaded an old index, checking for other indices..."
+            );
+
+            // Find the highest-generation index
+            let (_latest_index_part, latest_index_generation, latest_index_mtime) =
+                download::download_index_part(
+                    &self.storage_impl,
+                    &self.tenant_shard_id,
+                    &self.timeline_id,
+                    Generation::MAX,
+                    cancel,
+                )
+                .await?;
+
+            if latest_index_generation > index_generation {
+                // Unexpected!  Why are we loading such an old index if a more recent one exists?
+                tracing::warn!(
+                    ?index_generation,
+                    ?latest_index_generation,
+                    ?latest_index_mtime,
+                    "Found a newer index while loading an old one"
+                );
+            }
+        }
+
         if index_part.deleted_at.is_some() {
             Ok(MaybeDeletedIndexPart::Deleted(index_part))
         } else {
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 692e4d3096..b5d4b0f0bb 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,6 +6,7 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::str::FromStr;
+use std::time::SystemTime;
 
 use anyhow::{anyhow, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -343,10 +344,10 @@ async fn do_download_index_part(
     timeline_id: &TimelineId,
     index_generation: Generation,
     cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation), DownloadError> {
+) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
-    let index_part_bytes = download_retry_forever(
+    let (index_part_bytes, index_part_mtime) = download_retry_forever(
         || async {
             let download = storage
                 .download(&remote_path, &DownloadOpts::default(), cancel)
@@ -359,7 +360,7 @@ async fn do_download_index_part(
 
             tokio::io::copy_buf(&mut stream, &mut bytes).await?;
 
-            Ok(bytes)
+            Ok((bytes, download.last_modified))
         },
         &format!("download {remote_path:?}"),
         cancel,
@@ -370,7 +371,7 @@ async fn do_download_index_part(
         .with_context(|| format!("deserialize index part file at {remote_path:?}"))
         .map_err(DownloadError::Other)?;
 
-    Ok((index_part, index_generation))
+    Ok((index_part, index_generation, index_part_mtime))
 }
 
 /// index_part.json objects are suffixed with a generation number, so we cannot
@@ -385,7 +386,7 @@ pub(crate) async fn download_index_part(
     timeline_id: &TimelineId,
     my_generation: Generation,
     cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation), DownloadError> {
+) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
     if my_generation.is_none() {

From 928d98b6dcb57ae22a3da18fc6786b90c8dcae0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 17 Oct 2024 21:25:51 +0200
Subject: [PATCH 12/48] Update Rust to 1.82.0 and mold to 2.34.0 (#9445)

We keep the practice of keeping the compiler up to date, pointing to the
latest release. This is done by many other projects in the Rust
ecosystem as well.

[Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1820-2024-10-17).

Also update mold to 2.34.1. [release notes for
2.34.0](https://github.com/rui314/mold/releases/tag/v2.34.0), [release
notes for 2.34.1](https://github.com/rui314/mold/releases/tag/v2.34.1).

Prior update was in #8939.
---
 Dockerfile.build-tools | 6 +++---
 rust-toolchain.toml    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index 7cba1c8635..f05c60661c 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -72,7 +72,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
     && mv s5cmd /usr/local/bin/s5cmd
 
 # LLVM
-ENV LLVM_VERSION=18
+ENV LLVM_VERSION=19
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
     && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
     && apt update \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
     && rm awscliv2.zip
 
 # Mold: A Modern Linker
-ENV MOLD_VERSION=v2.33.0
+ENV MOLD_VERSION=v2.34.1
 RUN set -e \
     && git clone https://github.com/rui314/mold.git \
     && mkdir mold/build \
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.81.0
+ENV RUSTC_VERSION=1.82.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 3c5d0b12a6..92b7929c7f 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.81.0"
+channel = "1.82.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html

From d762ad0883f204dee1b15729db8a6a3d6d5497e5 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 17 Oct 2024 20:45:37 +0100
Subject: [PATCH 13/48] update rustls (#9396)

The forever ongoing effort of juggling multiple versions of rustls :3

now with new crypto library aws-lc.

Because of dependencies, it is currently impossible to not have both
ring and aws-lc in the dep tree, therefore our only options are not
updating rustls or having both crypto backends enabled...

According to benchmarks run by the rustls maintainer, aws-lc is faster
than ring in some cases too <https://jbp.io/graviola/>, so it's not
without its upsides.
---
 Cargo.lock                                    | 220 +++++++++++++-----
 Cargo.toml                                    |  12 +-
 libs/postgres_backend/tests/simple_select.rs  |  29 ++-
 proxy/src/bin/pg_sni_router.rs                |  10 +-
 proxy/src/compute.rs                          |  30 ++-
 proxy/src/config.rs                           |  14 +-
 proxy/src/proxy/tests/mod.rs                  |  51 ++--
 .../src/scan_safekeeper_metadata.rs           |  22 +-
 workspace_hack/Cargo.toml                     |  11 +-
 9 files changed, 276 insertions(+), 123 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 6b212bac2e..ad29fa4634 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -148,9 +148,9 @@ dependencies = [
 
 [[package]]
 name = "asn1-rs"
-version = "0.5.2"
+version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0"
+checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048"
 dependencies = [
  "asn1-rs-derive",
  "asn1-rs-impl",
@@ -164,25 +164,25 @@ dependencies = [
 
 [[package]]
 name = "asn1-rs-derive"
-version = "0.4.0"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c"
+checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.52",
  "synstructure",
 ]
 
 [[package]]
 name = "asn1-rs-impl"
-version = "0.1.0"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed"
+checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -310,6 +310,33 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "aws-lc-rs"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070"
+dependencies = [
+ "aws-lc-sys",
+ "mirai-annotations",
+ "paste",
+ "zeroize",
+]
+
+[[package]]
+name = "aws-lc-sys"
+version = "0.21.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62"
+dependencies = [
+ "bindgen 0.69.5",
+ "cc",
+ "cmake",
+ "dunce",
+ "fs_extra",
+ "libc",
+ "paste",
+]
+
 [[package]]
 name = "aws-runtime"
 version = "1.4.3"
@@ -595,7 +622,7 @@ dependencies = [
  "once_cell",
  "pin-project-lite",
  "pin-utils",
- "rustls 0.21.11",
+ "rustls 0.21.12",
  "tokio",
  "tracing",
 ]
@@ -915,6 +942,29 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "bindgen"
+version = "0.69.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
+dependencies = [
+ "bitflags 2.4.1",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.10.5",
+ "lazy_static",
+ "lazycell",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn 2.0.52",
+ "which",
+]
+
 [[package]]
 name = "bindgen"
 version = "0.70.1"
@@ -924,7 +974,7 @@ dependencies = [
  "bitflags 2.4.1",
  "cexpr",
  "clang-sys",
- "itertools 0.12.1",
+ "itertools 0.10.5",
  "log",
  "prettyplease",
  "proc-macro2",
@@ -1038,12 +1088,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 
 [[package]]
 name = "cc"
-version = "1.0.83"
+version = "1.1.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
+checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
 dependencies = [
  "jobserver",
  "libc",
+ "shlex",
 ]
 
 [[package]]
@@ -1169,6 +1220,15 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
 
+[[package]]
+name = "cmake"
+version = "0.1.51"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -1624,9 +1684,9 @@ dependencies = [
 
 [[package]]
 name = "der-parser"
-version = "8.2.0"
+version = "9.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e"
+checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553"
 dependencies = [
  "asn1-rs",
  "displaydoc",
@@ -1755,6 +1815,12 @@ dependencies = [
  "syn 2.0.52",
 ]
 
+[[package]]
+name = "dunce"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
+
 [[package]]
 name = "dyn-clone"
 version = "1.0.14"
@@ -2059,6 +2125,12 @@ dependencies = [
  "tokio-util",
 ]
 
+[[package]]
+name = "fs_extra"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
+
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"
@@ -2412,6 +2484,15 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "home"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "hostname"
 version = "0.4.0"
@@ -2581,7 +2662,7 @@ dependencies = [
  "http 0.2.9",
  "hyper 0.14.30",
  "log",
- "rustls 0.21.11",
+ "rustls 0.21.12",
  "rustls-native-certs 0.6.2",
  "tokio",
  "tokio-rustls 0.24.0",
@@ -2801,9 +2882,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 
 [[package]]
 name = "jobserver"
-version = "0.1.26"
+version = "0.1.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
+checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
 dependencies = [
  "libc",
 ]
@@ -2907,6 +2988,12 @@ dependencies = [
  "spin",
 ]
 
+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3137,6 +3224,12 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "mirai-annotations"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
+
 [[package]]
 name = "multimap"
 version = "0.8.3"
@@ -3356,9 +3449,9 @@ dependencies = [
 
 [[package]]
 name = "oid-registry"
-version = "0.6.1"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff"
+checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9"
 dependencies = [
  "asn1-rs",
 ]
@@ -4053,14 +4146,14 @@ dependencies = [
  "bytes",
  "once_cell",
  "pq_proto",
- "rustls 0.22.4",
+ "rustls 0.23.7",
  "rustls-pemfile 2.1.1",
  "serde",
  "thiserror",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
+ "tokio-rustls 0.26.0",
  "tokio-util",
  "tracing",
 ]
@@ -4082,7 +4175,7 @@ name = "postgres_ffi"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "bindgen",
+ "bindgen 0.70.1",
  "bytes",
  "crc32c",
  "env_logger",
@@ -4219,7 +4312,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
  "bytes",
  "heck 0.5.0",
- "itertools 0.12.1",
+ "itertools 0.10.5",
  "log",
  "multimap",
  "once_cell",
@@ -4239,7 +4332,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
 dependencies = [
  "anyhow",
- "itertools 0.12.1",
+ "itertools 0.10.5",
  "proc-macro2",
  "quote",
  "syn 2.0.52",
@@ -4327,8 +4420,8 @@ dependencies = [
  "rsa",
  "rstest",
  "rustc-hash",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
+ "rustls 0.23.7",
+ "rustls-native-certs 0.8.0",
  "rustls-pemfile 2.1.1",
  "scopeguard",
  "serde",
@@ -4345,7 +4438,7 @@ dependencies = [
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
+ "tokio-rustls 0.26.0",
  "tokio-tungstenite",
  "tokio-util",
  "tracing",
@@ -4509,12 +4602,13 @@ dependencies = [
 
 [[package]]
 name = "rcgen"
-version = "0.12.1"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
+checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779"
 dependencies = [
  "pem",
  "ring",
+ "rustls-pki-types",
  "time",
  "yasna",
 ]
@@ -4693,7 +4787,7 @@ dependencies = [
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
- "rustls 0.21.11",
+ "rustls 0.21.12",
  "rustls-pemfile 1.0.2",
  "serde",
  "serde_json",
@@ -4991,9 +5085,9 @@ dependencies = [
 
 [[package]]
 name = "rustls"
-version = "0.21.11"
+version = "0.21.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
+checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e"
 dependencies = [
  "log",
  "ring",
@@ -5021,6 +5115,7 @@ version = "0.23.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b"
 dependencies = [
+ "aws-lc-rs",
  "log",
  "once_cell",
  "ring",
@@ -5089,9 +5184,9 @@ dependencies = [
 
 [[package]]
 name = "rustls-pki-types"
-version = "1.3.1"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
+checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
 
 [[package]]
 name = "rustls-webpki"
@@ -5109,6 +5204,7 @@ version = "0.102.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
 dependencies = [
+ "aws-lc-rs",
  "ring",
  "rustls-pki-types",
  "untrusted",
@@ -5312,7 +5408,7 @@ checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
 dependencies = [
  "httpdate",
  "reqwest 0.12.4",
- "rustls 0.21.11",
+ "rustls 0.21.12",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
@@ -5807,8 +5903,8 @@ dependencies = [
  "postgres_ffi",
  "remote_storage",
  "reqwest 0.12.4",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
+ "rustls 0.23.7",
+ "rustls-native-certs 0.8.0",
  "serde",
  "serde_json",
  "storage_controller_client",
@@ -5930,14 +6026,13 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
 
 [[package]]
 name = "synstructure"
-version = "0.12.6"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
+checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 1.0.109",
- "unicode-xid",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -6236,16 +6331,15 @@ dependencies = [
 
 [[package]]
 name = "tokio-postgres-rustls"
-version = "0.11.1"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
+checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
 dependencies = [
- "futures",
  "ring",
- "rustls 0.22.4",
+ "rustls 0.23.7",
  "tokio",
  "tokio-postgres",
- "tokio-rustls 0.25.0",
+ "tokio-rustls 0.26.0",
  "x509-certificate",
 ]
 
@@ -6255,7 +6349,7 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
 dependencies = [
- "rustls 0.21.11",
+ "rustls 0.21.12",
  "tokio",
 ]
 
@@ -6678,16 +6772,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
 
 [[package]]
 name = "ureq"
-version = "2.9.7"
+version = "2.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
+checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a"
 dependencies = [
  "base64 0.22.1",
  "log",
  "once_cell",
- "rustls 0.22.4",
+ "rustls 0.23.7",
  "rustls-pki-types",
- "rustls-webpki 0.102.2",
  "url",
  "webpki-roots 0.26.1",
 ]
@@ -6876,7 +6969,7 @@ name = "walproposer"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "bindgen",
+ "bindgen 0.70.1",
  "postgres_ffi",
  "utils",
 ]
@@ -7051,6 +7144,18 @@ dependencies = [
  "rustls-pki-types",
 ]
 
+[[package]]
+name = "which"
+version = "4.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
+dependencies = [
+ "either",
+ "home",
+ "once_cell",
+ "rustix",
+]
+
 [[package]]
 name = "whoami"
 version = "1.5.1"
@@ -7295,7 +7400,6 @@ dependencies = [
  "digest",
  "either",
  "fail",
- "futures",
  "futures-channel",
  "futures-executor",
  "futures-io",
@@ -7311,7 +7415,7 @@ dependencies = [
  "hyper-util",
  "indexmap 1.9.3",
  "indexmap 2.0.1",
- "itertools 0.12.1",
+ "itertools 0.10.5",
  "lazy_static",
  "libc",
  "log",
@@ -7332,6 +7436,8 @@ dependencies = [
  "regex-automata 0.4.3",
  "regex-syntax 0.8.2",
  "reqwest 0.12.4",
+ "rustls 0.23.7",
+ "rustls-webpki 0.102.2",
  "scopeguard",
  "serde",
  "serde_json",
@@ -7340,7 +7446,6 @@ dependencies = [
  "smallvec",
  "spki 0.7.3",
  "subtle",
- "syn 1.0.109",
  "syn 2.0.52",
  "sync_wrapper 0.1.2",
  "tikv-jemalloc-sys",
@@ -7348,6 +7453,7 @@ dependencies = [
  "time-macros",
  "tokio",
  "tokio-postgres",
+ "tokio-rustls 0.26.0",
  "tokio-stream",
  "tokio-util",
  "toml_edit",
@@ -7383,9 +7489,9 @@ dependencies = [
 
 [[package]]
 name = "x509-parser"
-version = "0.15.0"
+version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634"
+checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69"
 dependencies = [
  "asn1-rs",
  "data-encoding",
diff --git a/Cargo.toml b/Cargo.toml
index a1a974b33b..4c6a24ecde 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -142,7 +142,7 @@ reqwest-retry = "0.5"
 routerify = "3"
 rpds = "0.13"
 rustc-hash = "1.1.0"
-rustls = "0.22"
+rustls = "0.23"
 rustls-pemfile = "2"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
@@ -172,8 +172,8 @@ tikv-jemalloc-ctl = "0.5"
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
-tokio-postgres-rustls = "0.11.0"
-tokio-rustls = "0.25"
+tokio-postgres-rustls = "0.12.0"
+tokio-rustls = "0.26"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
@@ -192,8 +192,8 @@ url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
-rustls-native-certs = "0.7"
-x509-parser = "0.15"
+rustls-native-certs = "0.8"
+x509-parser = "0.16"
 whoami = "1.5.1"
 
 ## TODO replace this with tracing
@@ -244,7 +244,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 
 ## Build dependencies
 criterion = "0.5.1"
-rcgen = "0.12"
+rcgen = "0.13"
 rstest = "0.18"
 camino-tempfile = "1.0.2"
 tonic-build = "0.12"
diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs
index 900083ea7f..9d3031d699 100644
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -2,6 +2,7 @@
 use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
+use rustls::crypto::aws_lc_rs;
 use std::io::Cursor;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -92,10 +93,13 @@ static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
 async fn simple_select_ssl() {
     let (client_sock, server_sock) = make_tcp_pair().await;
 
-    let server_cfg = rustls::ServerConfig::builder()
-        .with_no_client_auth()
-        .with_single_cert(vec![CERT.clone()], KEY.clone_key())
-        .unwrap();
+    let server_cfg =
+        rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+            .with_safe_default_protocol_versions()
+            .expect("aws_lc_rs should support the default protocol versions")
+            .with_no_client_auth()
+            .with_single_cert(vec![CERT.clone()], KEY.clone_key())
+            .unwrap();
     let tls_config = Some(Arc::new(server_cfg));
     let pgbackend =
         PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation");
@@ -105,13 +109,16 @@ async fn simple_select_ssl() {
         pgbackend.run(&mut handler, &CancellationToken::new()).await
     });
 
-    let client_cfg = rustls::ClientConfig::builder()
-        .with_root_certificates({
-            let mut store = rustls::RootCertStore::empty();
-            store.add(CERT.clone()).unwrap();
-            store
-        })
-        .with_no_client_auth();
+    let client_cfg =
+        rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+            .with_safe_default_protocol_versions()
+            .expect("aws_lc_rs should support the default protocol versions")
+            .with_root_certificates({
+                let mut store = rustls::RootCertStore::empty();
+                store.add(CERT.clone()).unwrap();
+                store
+            })
+            .with_no_client_auth();
     let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg);
     let tls_connect = <MakeRustlsConnect as MakeTlsConnect<TcpStream>>::make_tls_connect(
         &mut make_tls_connect,
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 00eb830d98..13b7fdd40a 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -15,6 +15,7 @@ use proxy::context::RequestMonitoring;
 use proxy::metrics::{Metrics, ThreadPoolMetrics};
 use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
 use proxy::stream::{PqStream, Stream};
+use rustls::crypto::aws_lc_rs;
 use rustls::pki_types::PrivateKeyDer;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::TcpListener;
@@ -104,10 +105,11 @@ async fn main() -> anyhow::Result<()> {
             let first_cert = cert_chain.first().context("missing certificate")?;
             let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
 
-            let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[
-                &rustls::version::TLS13,
-                &rustls::version::TLS12,
-            ])
+            let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new(
+                aws_lc_rs::default_provider(),
+            ))
+            .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
+            .context("aws_lc_rs should support TLS1.2 and TLS1.3")?
             .with_no_client_auth()
             .with_single_cert(cert_chain, key)?
             .into();
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 212e82497f..a7c2cab4a1 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -8,6 +8,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pq_proto::StartupMessageParams;
 use rustls::client::danger::ServerCertVerifier;
+use rustls::crypto::aws_lc_rs;
 use rustls::pki_types::InvalidDnsNameError;
 use thiserror::Error;
 use tokio::net::TcpStream;
@@ -38,6 +39,9 @@ pub(crate) enum ConnectionError {
     #[error("{COULD_NOT_CONNECT}: {0}")]
     CouldNotConnect(#[from] io::Error),
 
+    #[error("Couldn't load native TLS certificates: {0:?}")]
+    TlsCertificateError(Vec<rustls_native_certs::Error>),
+
     #[error("{COULD_NOT_CONNECT}: {0}")]
     TlsError(#[from] InvalidDnsNameError),
 
@@ -84,6 +88,7 @@ impl ReportableError for ConnectionError {
             }
             ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute,
             ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
+            ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service,
             ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
             ConnectionError::WakeComputeError(e) => e.get_error_kind(),
             ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
@@ -293,12 +298,20 @@ impl ConnCfg {
         let client_config = if allow_self_signed_compute {
             // Allow all certificates for creating the connection
             let verifier = Arc::new(AcceptEverythingVerifier);
-            rustls::ClientConfig::builder()
+            rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+                .with_safe_default_protocol_versions()
+                .expect("aws_lc_rs should support the default protocol versions")
                 .dangerous()
                 .with_custom_certificate_verifier(verifier)
         } else {
-            let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
-            rustls::ClientConfig::builder().with_root_certificates(root_store)
+            let root_store = TLS_ROOTS
+                .get_or_try_init(load_certs)
+                .map_err(ConnectionError::TlsCertificateError)?
+                .clone();
+            rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+                .with_safe_default_protocol_versions()
+                .expect("aws_lc_rs should support the default protocol versions")
+                .with_root_certificates(root_store)
         };
         let client_config = client_config.with_no_client_auth();
 
@@ -359,10 +372,15 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
     Some(options)
 }
 
-fn load_certs() -> Result<Arc<rustls::RootCertStore>, io::Error> {
-    let der_certs = rustls_native_certs::load_native_certs()?;
+fn load_certs() -> Result<Arc<rustls::RootCertStore>, Vec<rustls_native_certs::Error>> {
+    let der_certs = rustls_native_certs::load_native_certs();
+
+    if !der_certs.errors.is_empty() {
+        return Err(der_certs.errors);
+    }
+
     let mut store = rustls::RootCertStore::empty();
-    store.add_parsable_certificates(der_certs);
+    store.add_parsable_certificates(der_certs.certs);
     Ok(Arc::new(store))
 }
 static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 2ec8c7adda..0d5ebd88f9 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -7,7 +7,7 @@ use anyhow::{bail, ensure, Context, Ok};
 use clap::ValueEnum;
 use itertools::Itertools;
 use remote_storage::RemoteStorageConfig;
-use rustls::crypto::ring::sign;
+use rustls::crypto::aws_lc_rs::{self, sign};
 use rustls::pki_types::{CertificateDer, PrivateKeyDer};
 use sha2::{Digest, Sha256};
 use tracing::{error, info};
@@ -126,12 +126,12 @@ pub fn configure_tls(
     let cert_resolver = Arc::new(cert_resolver);
 
     // allow TLS 1.2 to be compatible with older client libraries
-    let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[
-        &rustls::version::TLS13,
-        &rustls::version::TLS12,
-    ])
-    .with_no_client_auth()
-    .with_cert_resolver(cert_resolver.clone());
+    let mut config =
+        rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+            .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])
+            .context("aws_lc_rs should support TLS1.2 and TLS1.3")?
+            .with_no_client_auth()
+            .with_cert_resolver(cert_resolver.clone());
 
     config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()];
 
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index e50ae4bc93..88175d73b1 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -9,6 +9,7 @@ use async_trait::async_trait;
 use http::StatusCode;
 use retry::{retry_after, ShouldRetryWakeCompute};
 use rstest::rstest;
+use rustls::crypto::aws_lc_rs;
 use rustls::pki_types;
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::{MakeTlsConnect, NoTls};
@@ -38,25 +39,27 @@ fn generate_certs(
     pki_types::CertificateDer<'static>,
     pki_types::PrivateKeyDer<'static>,
 )> {
-    let ca = rcgen::Certificate::from_params({
+    let ca_key = rcgen::KeyPair::generate()?;
+    let ca = {
         let mut params = rcgen::CertificateParams::default();
         params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained);
-        params
-    })?;
+        params.self_signed(&ca_key)?
+    };
 
-    let cert = rcgen::Certificate::from_params({
-        let mut params = rcgen::CertificateParams::new(vec![hostname.into()]);
+    let cert_key = rcgen::KeyPair::generate()?;
+    let cert = {
+        let mut params = rcgen::CertificateParams::new(vec![hostname.into()])?;
         params.distinguished_name = rcgen::DistinguishedName::new();
         params
             .distinguished_name
             .push(rcgen::DnType::CommonName, common_name);
-        params
-    })?;
+        params.signed_by(&cert_key, &ca, &ca_key)?
+    };
 
     Ok((
-        pki_types::CertificateDer::from(ca.serialize_der()?),
-        pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?),
-        pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()),
+        ca.der().clone(),
+        cert.der().clone(),
+        pki_types::PrivateKeyDer::Pkcs8(cert_key.serialize_der().into()),
     ))
 }
 
@@ -90,10 +93,13 @@ fn generate_tls_config<'a>(
     let (ca, cert, key) = generate_certs(hostname, common_name)?;
 
     let tls_config = {
-        let config = rustls::ServerConfig::builder()
-            .with_no_client_auth()
-            .with_single_cert(vec![cert.clone()], key.clone_key())?
-            .into();
+        let config =
+            rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+                .with_safe_default_protocol_versions()
+                .context("aws_lc_rs should support the default protocol versions")?
+                .with_no_client_auth()
+                .with_single_cert(vec![cert.clone()], key.clone_key())?
+                .into();
 
         let mut cert_resolver = CertResolver::new();
         cert_resolver.add_cert(key, vec![cert], true)?;
@@ -108,13 +114,16 @@ fn generate_tls_config<'a>(
     };
 
     let client_config = {
-        let config = rustls::ClientConfig::builder()
-            .with_root_certificates({
-                let mut store = rustls::RootCertStore::empty();
-                store.add(ca)?;
-                store
-            })
-            .with_no_client_auth();
+        let config =
+            rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+                .with_safe_default_protocol_versions()
+                .context("aws_lc_rs should support the default protocol versions")?
+                .with_root_certificates({
+                    let mut store = rustls::RootCertStore::empty();
+                    store.add(ca)?;
+                    store
+                })
+                .with_no_client_auth();
 
         ClientConfig { config, hostname }
     };
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index 15f3665fac..6c312d0036 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -1,10 +1,12 @@
 use std::{collections::HashSet, str::FromStr, sync::Arc};
 
+use anyhow::{bail, Context};
 use futures::stream::{StreamExt, TryStreamExt};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::{XLogFileName, PG_TLI};
 use remote_storage::GenericRemoteStorage;
+use rustls::crypto::aws_lc_rs;
 use serde::Serialize;
 use tokio_postgres::types::PgLsn;
 use tracing::{debug, error, info};
@@ -231,10 +233,15 @@ async fn check_timeline(
     })
 }
 
-fn load_certs() -> Result<Arc<rustls::RootCertStore>, std::io::Error> {
-    let der_certs = rustls_native_certs::load_native_certs()?;
+fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
+    let der_certs = rustls_native_certs::load_native_certs();
+
+    if !der_certs.errors.is_empty() {
+        bail!("could not load native tls certs: {:?}", der_certs.errors);
+    }
+
     let mut store = rustls::RootCertStore::empty();
-    store.add_parsable_certificates(der_certs);
+    store.add_parsable_certificates(der_certs.certs);
     Ok(Arc::new(store))
 }
 static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
@@ -248,9 +255,12 @@ async fn load_timelines_from_db(
 
     // Use rustls (Neon requires TLS)
     let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
-    let client_config = rustls::ClientConfig::builder()
-        .with_root_certificates(root_store)
-        .with_no_client_auth();
+    let client_config =
+        rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+            .with_safe_default_protocol_versions()
+            .context("aws_lc_rs should support the default protocol versions")?
+            .with_root_certificates(root_store)
+            .with_no_client_auth();
     let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
     let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
     // The connection object performs the actual communication with the database,
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 1347d6ddff..28c51b8ac1 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -32,7 +32,6 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt",
 digest = { version = "0.10", features = ["mac", "oid", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
-futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
@@ -48,7 +47,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"]
 hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] }
 indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] }
 indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] }
-itertools = { version = "0.12" }
+itertools = { version = "0.10" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -66,6 +65,8 @@ regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
 regex-syntax = { version = "0.8" }
 reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] }
+rustls = { version = "0.23", features = ["ring"] }
+rustls-webpki = { version = "0.102", default-features = false, features = ["aws_lc_rs", "ring", "std"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["alloc", "raw_value"] }
@@ -79,6 +80,7 @@ tikv-jemalloc-sys = { version = "0.5" }
 time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] }
+tokio-rustls = { version = "0.26", features = ["ring"] }
 tokio-stream = { version = "0.1", features = ["net"] }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
 toml_edit = { version = "0.22", features = ["serde"] }
@@ -104,7 +106,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] }
 indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] }
-itertools = { version = "0.12" }
+itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
@@ -122,8 +124,7 @@ regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
 regex-syntax = { version = "0.8" }
 serde = { version = "1", features = ["alloc", "derive"] }
-syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
-syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
+syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
 toml_edit = { version = "0.22", features = ["serde"] }
 zstd = { version = "0.13" }

From b8304f90d6ad9a5f118a59ac392b3330495827d3 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 18 Oct 2024 10:27:50 +0100
Subject: [PATCH 14/48] 2024 oct new clippy lints (#9448)

Fixes new lints from `cargo +nightly clippy` (`clippy 0.1.83 (798fb83f
2024-10-16)`)
---
 compute_tools/src/extension_server.rs         |  2 +-
 .../pageserver_api/src/models/partitioning.rs |  6 ++--
 libs/postgres_backend/src/lib.rs              |  3 +-
 libs/pq_proto/src/lib.rs                      |  2 +-
 libs/tenant_size_model/src/svg.rs             |  2 +-
 libs/tracing-utils/src/http.rs                |  2 +-
 libs/utils/src/lsn.rs                         |  2 +-
 libs/utils/src/poison.rs                      |  4 +--
 libs/utils/src/shard.rs                       |  2 +-
 libs/utils/src/simple_rcu.rs                  |  4 +--
 libs/utils/src/sync/heavier_once_cell.rs      |  4 +--
 libs/utils/src/tracing_span_assert.rs         | 10 +++----
 pageserver/compaction/src/helpers.rs          | 10 +++----
 pageserver/src/consumption_metrics/upload.rs  |  2 +-
 pageserver/src/disk_usage_eviction_task.rs    |  2 +-
 pageserver/src/metrics.rs                     |  4 +--
 pageserver/src/statvfs.rs                     |  2 +-
 pageserver/src/tenant/block_io.rs             |  4 +--
 pageserver/src/tenant/disk_btree.rs           |  2 +-
 .../src/tenant/remote_timeline_client.rs      |  2 +-
 .../src/tenant/secondary/heatmap_uploader.rs  |  1 -
 pageserver/src/tenant/storage_layer.rs        |  2 +-
 .../src/tenant/storage_layer/delta_layer.rs   |  3 +-
 .../src/tenant/storage_layer/image_layer.rs   |  3 +-
 pageserver/src/tenant/storage_layer/layer.rs  |  2 +-
 .../src/tenant/storage_layer/layer_name.rs    |  2 +-
 .../tenant/storage_layer/merge_iterator.rs    |  8 +++---
 pageserver/src/tenant/vectored_blob_io.rs     | 21 +++-----------
 pageserver/src/virtual_file.rs                |  4 +--
 proxy/src/auth/credentials.rs                 |  2 +-
 proxy/src/config.rs                           |  2 +-
 proxy/src/context/parquet.rs                  |  2 +-
 proxy/src/intern.rs                           |  2 +-
 proxy/src/lib.rs                              |  6 +---
 proxy/src/proxy/tests/mod.rs                  | 10 +++----
 proxy/src/scram/exchange.rs                   |  4 ---
 proxy/src/serverless/conn_pool.rs             | 12 ++++----
 proxy/src/serverless/conn_pool_lib.rs         | 28 +++++++++----------
 proxy/src/serverless/http_conn_pool.rs        |  3 +-
 proxy/src/serverless/json.rs                  |  6 ++--
 proxy/src/serverless/local_conn_pool.rs       |  3 +-
 proxy/src/serverless/sql_over_http.rs         |  1 -
 proxy/src/usage_metrics.rs                    | 10 +++----
 proxy/src/waiters.rs                          |  2 +-
 safekeeper/src/timeline.rs                    |  6 ++--
 45 files changed, 92 insertions(+), 124 deletions(-)

diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index 6ef7e0837f..da2d107b54 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -107,7 +107,7 @@ pub fn get_pg_version(pgbin: &str) -> String {
     // pg_config --version returns a (platform specific) human readable string
     // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
     let human_version = get_pg_config("--version", pgbin);
-    return parse_pg_version(&human_version).to_string();
+    parse_pg_version(&human_version).to_string()
 }
 
 fn parse_pg_version(human_version: &str) -> &str {
diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs
index f6644be635..69832b9a0d 100644
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -16,7 +16,7 @@ impl serde::Serialize for Partitioning {
     {
         pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
 
-        impl<'a> serde::Serialize for KeySpace<'a> {
+        impl serde::Serialize for KeySpace<'_> {
             fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
             where
                 S: serde::Serializer,
@@ -44,7 +44,7 @@ impl serde::Serialize for Partitioning {
 
 pub struct WithDisplay<'a, T>(&'a T);
 
-impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
+impl<T: std::fmt::Display> serde::Serialize for WithDisplay<'_, T> {
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
         S: serde::Serializer,
@@ -55,7 +55,7 @@ impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
 
 pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
 
-impl<'a> serde::Serialize for KeyRange<'a> {
+impl serde::Serialize for KeyRange<'_> {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
         S: serde::Serializer,
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 085540e7b9..9d274b25e6 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -921,12 +921,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
 /// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
 /// messages.
 ///
-
 pub struct CopyDataWriter<'a, IO> {
     pgb: &'a mut PostgresBackend<IO>,
 }
 
-impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> {
+impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
     fn poll_write(
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index a01191bd5d..9ffaaba584 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -727,7 +727,7 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
 pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
 pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";
 
-impl<'a> BeMessage<'a> {
+impl BeMessage<'_> {
     /// Serialize `message` to the given `buf`.
     /// Apart from smart memory managemet, BytesMut is good here as msg len
     /// precedes its body and it is handy to write it down first and then fill
diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs
index 0de2890bb4..25ebb1c3d8 100644
--- a/libs/tenant_size_model/src/svg.rs
+++ b/libs/tenant_size_model/src/svg.rs
@@ -97,7 +97,7 @@ pub fn draw_svg(
     Ok(result)
 }
 
-impl<'a> SvgDraw<'a> {
+impl SvgDraw<'_> {
     fn calculate_svg_layout(&mut self) {
         // Find x scale
         let segments = &self.storage.segments;
diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs
index e6fdf9be45..2168beee88 100644
--- a/libs/tracing-utils/src/http.rs
+++ b/libs/tracing-utils/src/http.rs
@@ -82,7 +82,7 @@ where
 fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context {
     struct HeaderExtractor<'a>(&'a HeaderMap);
 
-    impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> {
+    impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> {
         fn get(&self, key: &str) -> Option<&str> {
             self.0.get(key).and_then(|value| value.to_str().ok())
         }
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index 06d5c27ebf..3ec2c130bd 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -37,7 +37,7 @@ impl<'de> Deserialize<'de> for Lsn {
             is_human_readable_deserializer: bool,
         }
 
-        impl<'de> Visitor<'de> for LsnVisitor {
+        impl Visitor<'_> for LsnVisitor {
             type Value = Lsn;
 
             fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs
index c3e2fba20c..ab9ebb3c5a 100644
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -73,7 +73,7 @@ impl<T> Poison<T> {
 /// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
 pub struct Guard<'a, T>(&'a mut Poison<T>);
 
-impl<'a, T> Guard<'a, T> {
+impl<T> Guard<'_, T> {
     pub fn data(&self) -> &T {
         &self.0.data
     }
@@ -94,7 +94,7 @@ impl<'a, T> Guard<'a, T> {
     }
 }
 
-impl<'a, T> Drop for Guard<'a, T> {
+impl<T> Drop for Guard<'_, T> {
     fn drop(&mut self) {
         match self.0.state {
             State::Clean => {
diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs
index d146010b41..782cddc599 100644
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -164,7 +164,7 @@ impl TenantShardId {
     }
 }
 
-impl<'a> std::fmt::Display for ShardSlug<'a> {
+impl std::fmt::Display for ShardSlug<'_> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs
index 01750b2aef..6700f86e4a 100644
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -152,7 +152,7 @@ pub struct RcuWriteGuard<'a, V> {
     inner: RwLockWriteGuard<'a, RcuInner<V>>,
 }
 
-impl<'a, V> Deref for RcuWriteGuard<'a, V> {
+impl<V> Deref for RcuWriteGuard<'_, V> {
     type Target = V;
 
     fn deref(&self) -> &V {
@@ -160,7 +160,7 @@ impl<'a, V> Deref for RcuWriteGuard<'a, V> {
     }
 }
 
-impl<'a, V> RcuWriteGuard<'a, V> {
+impl<V> RcuWriteGuard<'_, V> {
     ///
     /// Store a new value. The new value will be written to the Rcu immediately,
     /// and will be immediately seen by any `read` calls that start afterwards.
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index dc711fb028..66c2065554 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -219,7 +219,7 @@ impl<'a, T> CountWaitingInitializers<'a, T> {
     }
 }
 
-impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
+impl<T> Drop for CountWaitingInitializers<'_, T> {
     fn drop(&mut self) {
         self.0.initializers.fetch_sub(1, Ordering::Relaxed);
     }
@@ -250,7 +250,7 @@ impl<T> std::ops::DerefMut for Guard<'_, T> {
     }
 }
 
-impl<'a, T> Guard<'a, T> {
+impl<T> Guard<'_, T> {
     /// Take the current value, and a new permit for it's deinitialization.
     ///
     /// The permit will be on a semaphore part of the new internal value, and any following
diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs
index d24c81ad0b..add2fa7920 100644
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -184,23 +184,23 @@ mod tests {
 
     struct MemoryIdentity<'a>(&'a dyn Extractor);
 
-    impl<'a> MemoryIdentity<'a> {
+    impl MemoryIdentity<'_> {
         fn as_ptr(&self) -> *const () {
             self.0 as *const _ as *const ()
         }
     }
-    impl<'a> PartialEq for MemoryIdentity<'a> {
+    impl PartialEq for MemoryIdentity<'_> {
         fn eq(&self, other: &Self) -> bool {
             self.as_ptr() == other.as_ptr()
         }
     }
-    impl<'a> Eq for MemoryIdentity<'a> {}
-    impl<'a> Hash for MemoryIdentity<'a> {
+    impl Eq for MemoryIdentity<'_> {}
+    impl Hash for MemoryIdentity<'_> {
         fn hash<H: Hasher>(&self, state: &mut H) {
             self.as_ptr().hash(state);
         }
     }
-    impl<'a> fmt::Debug for MemoryIdentity<'a> {
+    impl fmt::Debug for MemoryIdentity<'_> {
         fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
             write!(f, "{:p}: {}", self.as_ptr(), self.0.id())
         }
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index 8ed1d16082..9dbb6ecedf 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -133,7 +133,7 @@ enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
     Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
     Unloaded(&'a E::DeltaLayer),
 }
-impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
+impl<E: CompactionJobExecutor> LazyLoadLayer<'_, E> {
     fn min_key(&self) -> E::Key {
         match self {
             Self::Loaded(entries) => entries.front().unwrap().key(),
@@ -147,23 +147,23 @@ impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
         }
     }
 }
-impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
+impl<E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'_, E> {
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
         Some(self.cmp(other))
     }
 }
-impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
+impl<E: CompactionJobExecutor> Ord for LazyLoadLayer<'_, E> {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         // reverse order so that we get a min-heap
         (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
     }
 }
-impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
+impl<E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'_, E> {
     fn eq(&self, other: &Self) -> bool {
         self.cmp(other) == std::cmp::Ordering::Equal
     }
 }
-impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
+impl<E: CompactionJobExecutor> Eq for LazyLoadLayer<'_, E> {}
 
 type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
 
diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs
index 0325ee403a..1eb25d337b 100644
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -198,7 +198,7 @@ fn serialize_in_chunks<'a>(
         }
     }
 
-    impl<'a> ExactSizeIterator for Iter<'a> {}
+    impl ExactSizeIterator for Iter<'_> {}
 
     let buffer = bytes::BytesMut::new();
     let inner = input.chunks(chunk_size);
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 7ab2ba8742..ca44fbe6ae 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -654,7 +654,7 @@ impl std::fmt::Debug for EvictionCandidate {
         let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
         let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
         struct DisplayIsDebug<'a, T>(&'a T);
-        impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
+        impl<T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'_, T> {
             fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                 write!(f, "{}", self.0)
             }
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index b76efa5b48..3e824b59fb 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1189,7 +1189,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
     op: SmgrQueryType,
 }
 
-impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
+impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
     fn drop(&mut self) {
         let elapsed = self.start.elapsed();
         let ex_throttled = self
@@ -1560,7 +1560,7 @@ impl BasebackupQueryTime {
     }
 }
 
-impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
+impl BasebackupQueryTimeOngoingRecording<'_, '_> {
     pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
         let elapsed = self.start.elapsed();
         let ex_throttled = self
diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs
index 205605bc86..4e8be58d58 100644
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -90,7 +90,7 @@ pub mod mock {
                 let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();
 
                 // round it up to the nearest block multiple
-                let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;
+                let used_blocks = used_bytes.div_ceil(*blocksize);
 
                 if used_blocks > *total_blocks {
                     panic!(
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 3afa3a86b9..1c82e5454d 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -50,13 +50,13 @@ impl From<PageReadGuard<'static>> for BlockLease<'static> {
 }
 
 #[cfg(test)]
-impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
+impl From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'_> {
     fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
         BlockLease::Arc(value)
     }
 }
 
-impl<'a> Deref for BlockLease<'a> {
+impl Deref for BlockLease<'_> {
     type Target = [u8; PAGE_SZ];
 
     fn deref(&self) -> &Self::Target {
diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index 0107b0ac7e..b302cbc975 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -131,7 +131,7 @@ struct OnDiskNode<'a, const L: usize> {
     values: &'a [u8],
 }
 
-impl<'a, const L: usize> OnDiskNode<'a, L> {
+impl<const L: usize> OnDiskNode<'_, L> {
     ///
     /// Interpret a PAGE_SZ page as a node.
     ///
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 450084aca2..14b894d17c 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2182,7 +2182,7 @@ pub(crate) struct UploadQueueAccessor<'a> {
     inner: std::sync::MutexGuard<'a, UploadQueue>,
 }
 
-impl<'a> UploadQueueAccessor<'a> {
+impl UploadQueueAccessor<'_> {
     pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
         match &*self.inner {
             UploadQueue::Initialized(x) => &x.clean.0,
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index 0aad5bf392..e680fd705b 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -108,7 +108,6 @@ impl scheduler::Completion for WriteComplete {
 /// when we last did a write.  We only populate this after doing at least one
 /// write for a tenant -- this avoids holding state for tenants that have
 /// uploads disabled.
-
 struct UploaderTenantState {
     // This Weak only exists to enable culling idle instances of this type
     // when the Tenant has been deallocated.
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 99bd0ece57..a229b59560 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -705,7 +705,7 @@ pub mod tests {
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
 struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);
 
-impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
+impl<T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'_, T> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}..{}", self.0.start, self.0.end)
     }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 8be7d7876f..d1079876f8 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -529,8 +529,7 @@ impl DeltaLayerWriterInner {
         key_end: Key,
         ctx: &RequestContext,
     ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        let index_start_blk =
-            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
+        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
 
         let mut file = self.blob_writer.into_inner(ctx).await?;
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index de8155f455..6c1a943470 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -828,8 +828,7 @@ impl ImageLayerWriterInner {
         ctx: &RequestContext,
         end_key: Option<Key>,
     ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        let index_start_blk =
-            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
+        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
 
         // Calculate compression ratio
         let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index f29a33bae6..38a7cd09af 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -978,7 +978,7 @@ impl LayerInner {
         let timeline = self
             .timeline
             .upgrade()
-            .ok_or_else(|| DownloadError::TimelineShutdown)?;
+            .ok_or(DownloadError::TimelineShutdown)?;
 
         // count cancellations, which currently remain largely unexpected
         let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index ffe7ca5f3e..8e750e1187 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -339,7 +339,7 @@ impl<'de> serde::Deserialize<'de> for LayerName {
 
 struct LayerNameVisitor;
 
-impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
+impl serde::de::Visitor<'_> for LayerNameVisitor {
     type Value = LayerName;
 
     fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs
index 0831fd9530..f91e27241d 100644
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -99,21 +99,21 @@ impl<'a> PeekableLayerIterRef<'a> {
     }
 }
 
-impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
+impl std::cmp::PartialEq for IteratorWrapper<'_> {
     fn eq(&self, other: &Self) -> bool {
         self.cmp(other) == Ordering::Equal
     }
 }
 
-impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}
+impl std::cmp::Eq for IteratorWrapper<'_> {}
 
-impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
+impl std::cmp::PartialOrd for IteratorWrapper<'_> {
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
+impl std::cmp::Ord for IteratorWrapper<'_> {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         use std::cmp::Ordering;
         let a = self.peek_next_key_lsn_value();
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 792c769b4f..0c03791034 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -73,7 +73,7 @@ impl<'a> BufView<'a> {
     }
 }
 
-impl<'a> Deref for BufView<'a> {
+impl Deref for BufView<'_> {
     type Target = [u8];
 
     fn deref(&self) -> &Self::Target {
@@ -84,7 +84,7 @@ impl<'a> Deref for BufView<'a> {
     }
 }
 
-impl<'a> AsRef<[u8]> for BufView<'a> {
+impl AsRef<[u8]> for BufView<'_> {
     fn as_ref(&self) -> &[u8] {
         match self {
             BufView::Slice(slice) => slice,
@@ -196,11 +196,6 @@ pub(crate) struct ChunkedVectoredReadBuilder {
     max_read_size: Option<usize>,
 }
 
-/// Computes x / d rounded up.
-fn div_round_up(x: usize, d: usize) -> usize {
-    (x + (d - 1)) / d
-}
-
 impl ChunkedVectoredReadBuilder {
     const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment();
     /// Start building a new vectored read.
@@ -220,7 +215,7 @@ impl ChunkedVectoredReadBuilder {
             .expect("First insertion always succeeds");
 
         let start_blk_no = start_offset as usize / Self::CHUNK_SIZE;
-        let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE);
+        let end_blk_no = (end_offset as usize).div_ceil(Self::CHUNK_SIZE);
         Self {
             start_blk_no,
             end_blk_no,
@@ -248,7 +243,7 @@ impl ChunkedVectoredReadBuilder {
     pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
         tracing::trace!(start, end, "trying to extend");
         let start_blk_no = start as usize / Self::CHUNK_SIZE;
-        let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE);
+        let end_blk_no = (end as usize).div_ceil(Self::CHUNK_SIZE);
 
         let not_limited_by_max_read_size = {
             if let Some(max_read_size) = self.max_read_size {
@@ -975,12 +970,4 @@ mod tests {
         round_trip_test_compressed(&blobs, true).await?;
         Ok(())
     }
-
-    #[test]
-    fn test_div_round_up() {
-        const CHUNK_SIZE: usize = 512;
-        assert_eq!(1, div_round_up(200, CHUNK_SIZE));
-        assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE));
-        assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE));
-    }
 }
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index d260116b38..5a364b7aaf 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -724,9 +724,9 @@ impl VirtualFileInner {
 
         *handle_guard = handle;
 
-        return Ok(FileGuard {
+        Ok(FileGuard {
             slot_guard: slot_guard.downgrade(),
-        });
+        })
     }
 
     pub fn remove(self) {
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index fa6bc4c6f5..465e427f7c 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -193,7 +193,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
         D: serde::Deserializer<'de>,
     {
         struct StrVisitor;
-        impl<'de> serde::de::Visitor<'de> for StrVisitor {
+        impl serde::de::Visitor<'_> for StrVisitor {
             type Value = IpPattern;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 0d5ebd88f9..3baa7ec751 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -558,7 +558,7 @@ pub struct RetryConfig {
 }
 
 impl RetryConfig {
-    /// Default options for RetryConfig.
+    // Default options for RetryConfig.
 
     /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s.
     pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index b0ad0e4566..3432ac5ff6 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -104,7 +104,7 @@ struct Options<'a> {
     options: &'a StartupMessageParams,
 }
 
-impl<'a> serde::Serialize for Options<'a> {
+impl serde::Serialize for Options<'_> {
     fn serialize<S>(&self, s: S) -> Result<S::Ok, S::Error>
     where
         S: serde::Serializer,
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs
index 09fd9657d0..49aab917e4 100644
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -55,7 +55,7 @@ impl<Id: InternId> std::ops::Deref for InternedString<Id> {
 impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
     fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
         struct Visitor<Id>(PhantomData<Id>);
-        impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
+        impl<Id: InternId> serde::de::Visitor<'_> for Visitor<Id> {
             type Value = InternedString<Id>;
 
             fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 74bc778a36..a7b3d45c95 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -76,11 +76,7 @@
     )
 )]
 // List of temporarily allowed lints to unblock beta/nightly.
-#![allow(
-    unknown_lints,
-    // TODO: 1.82: Add `use<T>` where necessary and remove from this list.
-    impl_trait_overcaptures,
-)]
+#![allow(unknown_lints)]
 
 use std::convert::Infallible;
 
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 88175d73b1..3f54b0661b 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -73,11 +73,11 @@ impl ClientConfig<'_> {
         self,
     ) -> anyhow::Result<
         impl tokio_postgres::tls::TlsConnect<
-            S,
-            Error = impl std::fmt::Debug,
-            Future = impl Send,
-            Stream = RustlsStream<S>,
-        >,
+                S,
+                Error = impl std::fmt::Debug + use<S>,
+                Future = impl Send + use<S>,
+                Stream = RustlsStream<S>,
+            > + use<S>,
     > {
         let mut mk = MakeRustlsConnect::new(self.config);
         let tls = MakeTlsConnect::<S>::make_tls_connect(&mut mk, self.hostname)?;
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 493295c938..6a13f645a5 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -218,16 +218,12 @@ impl sasl::Mechanism for Exchange<'_> {
                         self.state = ExchangeState::SaltSent(sent);
                         Ok(Step::Continue(self, msg))
                     }
-                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
-                    Step::Success(x, _) => match x {},
                     Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
             }
             ExchangeState::SaltSent(sent) => {
                 match sent.transition(self.secret, &self.tls_server_end_point, input)? {
                     Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
-                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
-                    Step::Continue(x, _) => match x {},
                     Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
             }
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index b97c656510..8401e3a1c9 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -11,13 +11,6 @@ use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::{AsyncMessage, Socket};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
-
-use crate::context::RequestMonitoring;
-use crate::control_plane::messages::MetricsAuxInfo;
-use crate::metrics::Metrics;
-
-use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool};
-
 #[cfg(test)]
 use {
     super::conn_pool_lib::GlobalConnPoolOptions,
@@ -25,6 +18,11 @@ use {
     std::{sync::atomic, time::Duration},
 };
 
+use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool};
+use crate::context::RequestMonitoring;
+use crate::control_plane::messages::MetricsAuxInfo;
+use crate::metrics::Metrics;
+
 #[derive(Debug, Clone)]
 pub(crate) struct ConnInfoWithAuth {
     pub(crate) conn_info: ConnInfo,
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index 6e964ce878..844730194d 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -1,25 +1,23 @@
+use std::collections::HashMap;
+use std::ops::Deref;
+use std::sync::atomic::{self, AtomicUsize};
+use std::sync::{Arc, Weak};
+use std::time::Duration;
+
 use dashmap::DashMap;
 use parking_lot::RwLock;
 use rand::Rng;
-use std::{collections::HashMap, sync::Arc, sync::Weak, time::Duration};
-use std::{
-    ops::Deref,
-    sync::atomic::{self, AtomicUsize},
-};
 use tokio_postgres::ReadyForQueryStatus;
+use tracing::{debug, info, Span};
 
+use super::backend::HttpConnError;
+use super::conn_pool::ClientInnerRemote;
+use crate::auth::backend::ComputeUserInfo;
+use crate::context::RequestMonitoring;
 use crate::control_plane::messages::ColdStartInfo;
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
-use crate::{
-    auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName,
-};
-
-use super::conn_pool::ClientInnerRemote;
-use tracing::info;
-use tracing::{debug, Span};
-
-use super::backend::HttpConnError;
+use crate::{DbName, EndpointCacheKey, RoleName};
 
 #[derive(Debug, Clone)]
 pub(crate) struct ConnInfo {
@@ -482,7 +480,7 @@ impl<C: ClientInnerExt> Client<C> {
         })
     }
 
-    pub(crate) fn do_drop(&mut self) -> Option<impl FnOnce()> {
+    pub(crate) fn do_drop(&mut self) -> Option<impl FnOnce() + use<C>> {
         let conn_info = self.conn_info.clone();
         let client = self
             .inner
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index 79bb19328f..363e397976 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -10,12 +10,11 @@ use rand::Rng;
 use tokio::net::TcpStream;
 use tracing::{debug, error, info, info_span, Instrument};
 
+use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
-
-use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
 use crate::EndpointCacheKey;
 
 pub(crate) type Send = http2::SendRequest<hyper::body::Incoming>;
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index 8c56d317cc..569e2da571 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -155,10 +155,10 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, Json
 // dimensions, we just return them as is.
 //
 fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result<Value, JsonConversionError> {
-    _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v)
+    pg_array_parse_inner(pg_array, elem_type, false).map(|(v, _)| v)
 }
 
-fn _pg_array_parse(
+fn pg_array_parse_inner(
     pg_array: &str,
     elem_type: &Type,
     nested: bool,
@@ -211,7 +211,7 @@ fn _pg_array_parse(
             '{' if !quote => {
                 level += 1;
                 if level > 1 {
-                    let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?;
+                    let (res, off) = pg_array_parse_inner(&pg_array[i..], elem_type, true)?;
                     entries.push(res);
                     for _ in 0..off - 1 {
                         pg_array_chr.next();
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index c4fdd00f78..a01afd2820 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -25,7 +25,6 @@ use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
-
 use crate::{DbName, RoleName};
 
 struct ConnPoolEntry<C: ClientInnerExt> {
@@ -530,7 +529,7 @@ impl<C: ClientInnerExt> LocalClient<C> {
         })
     }
 
-    fn do_drop(&mut self) -> Option<impl FnOnce()> {
+    fn do_drop(&mut self) -> Option<impl FnOnce() + use<C>> {
         let conn_info = self.conn_info.clone();
         let client = self
             .inner
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index bb5eb390a6..6fbb044669 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -38,7 +38,6 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::metrics::{HttpDirection, Metrics};
 use crate::proxy::{run_until_cancelled, NeonOptions};
 use crate::serverless::backend::HttpConnError;
-
 use crate::usage_metrics::{MetricCounter, MetricCounterRecorder};
 use crate::{DbName, RoleName};
 
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index c5384c0b0e..f944d5aec3 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -375,7 +375,7 @@ pub async fn task_backup(
         let now = Utc::now();
         collect_metrics_backup_iteration(
             &USAGE_METRICS.backup_endpoints,
-            &storage,
+            storage.as_ref(),
             &hostname,
             prev,
             now,
@@ -395,7 +395,7 @@ pub async fn task_backup(
 #[instrument(skip_all)]
 async fn collect_metrics_backup_iteration(
     endpoints: &DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
-    storage: &Option<GenericRemoteStorage>,
+    storage: Option<&GenericRemoteStorage>,
     hostname: &str,
     prev: DateTime<Utc>,
     now: DateTime<Utc>,
@@ -446,7 +446,7 @@ async fn collect_metrics_backup_iteration(
 }
 
 async fn upload_events_chunk(
-    storage: &Option<GenericRemoteStorage>,
+    storage: Option<&GenericRemoteStorage>,
     chunk: EventChunk<'_, Event<Ids, &'static str>>,
     remote_path: &RemotePath,
     cancel: &CancellationToken,
@@ -577,10 +577,10 @@ mod tests {
         // counter is unregistered
         assert!(metrics.endpoints.is_empty());
 
-        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
+        collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000)
             .await;
         assert!(!metrics.backup_endpoints.is_empty());
-        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
+        collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000)
             .await;
         // backup counter is unregistered after the second iteration
         assert!(metrics.backup_endpoints.is_empty());
diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs
index 7e07f6a2af..330e73f02f 100644
--- a/proxy/src/waiters.rs
+++ b/proxy/src/waiters.rs
@@ -73,7 +73,7 @@ struct DropKey<'a, T> {
     registry: &'a Waiters<T>,
 }
 
-impl<'a, T> Drop for DropKey<'a, T> {
+impl<T> Drop for DropKey<'_, T> {
     fn drop(&mut self) {
         self.registry.0.lock().remove(&self.key);
     }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 3494b0b764..41b9490088 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -122,7 +122,7 @@ impl<'a> WriteGuardSharedState<'a> {
     }
 }
 
-impl<'a> Deref for WriteGuardSharedState<'a> {
+impl Deref for WriteGuardSharedState<'_> {
     type Target = SharedState;
 
     fn deref(&self) -> &Self::Target {
@@ -130,13 +130,13 @@ impl<'a> Deref for WriteGuardSharedState<'a> {
     }
 }
 
-impl<'a> DerefMut for WriteGuardSharedState<'a> {
+impl DerefMut for WriteGuardSharedState<'_> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         &mut self.guard
     }
 }
 
-impl<'a> Drop for WriteGuardSharedState<'a> {
+impl Drop for WriteGuardSharedState<'_> {
     fn drop(&mut self) {
         let term_flush_lsn =
             TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn()));

From 24654b8eee8706e8ae98948733a28b56df83536b Mon Sep 17 00:00:00 2001
From: Jere Vaara <jerevaara@gmail.com>
Date: Fri, 18 Oct 2024 13:25:45 +0300
Subject: [PATCH 15/48] compute_ctl: Add endpoint that allows setting role
 grants (#9395)

This PR introduces a `/grants` endpoint which allows granting specific
`privileges` on a given `schema` to a given `role`.

Related to #9344

Together, these endpoints will be used to configure the JWT extension and
grant the correct usage privileges on its schema to the specific roles that need them.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
---
 compute_tools/src/compute.rs             | 43 ++++++++++++
 compute_tools/src/http/api.rs            | 48 ++++++++++++-
 compute_tools/src/http/openapi_spec.yaml | 89 ++++++++++++++++++++++++
 libs/compute_api/src/lib.rs              |  1 +
 libs/compute_api/src/privilege.rs        | 35 ++++++++++
 libs/compute_api/src/requests.rs         | 13 +++-
 libs/compute_api/src/responses.rs        | 13 +++-
 test_runner/fixtures/endpoint/http.py    |  8 +++
 test_runner/regress/test_role_grants.py  | 41 +++++++++++
 9 files changed, 287 insertions(+), 4 deletions(-)
 create mode 100644 libs/compute_api/src/privilege.rs
 create mode 100644 test_runner/regress/test_role_grants.py

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 6aec008f3a..11fee73f03 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -15,6 +15,7 @@ use std::time::Instant;
 
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use compute_api::spec::PgIdent;
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
@@ -25,6 +26,7 @@ use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 
+use compute_api::privilege::Privilege;
 use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;
@@ -1373,6 +1375,47 @@ LIMIT 100",
         download_size
     }
 
+    pub async fn set_role_grants(
+        &self,
+        db_name: &PgIdent,
+        schema_name: &PgIdent,
+        privileges: &[Privilege],
+        role_name: &PgIdent,
+    ) -> Result<()> {
+        use tokio_postgres::config::Config;
+        use tokio_postgres::NoTls;
+
+        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
+        conf.dbname(db_name);
+
+        let (db_client, conn) = conf
+            .connect(NoTls)
+            .await
+            .context("Failed to connect to the database")?;
+        tokio::spawn(conn);
+
+        // TODO: support other types of grants apart from schemas?
+        let query = format!(
+            "GRANT {} ON SCHEMA {} TO {}",
+            privileges
+                .iter()
+                // should not be quoted as it's part of the command.
+                // is already sanitized so it's ok
+                .map(|p| p.as_str())
+                .collect::<Vec<&'static str>>()
+                .join(", "),
+            // quote the schema and role name as identifiers to sanitize them.
+            schema_name.pg_quote(),
+            role_name.pg_quote(),
+        );
+        db_client
+            .simple_query(&query)
+            .await
+            .with_context(|| format!("Failed to execute query: {}", query))?;
+
+        Ok(())
+    }
+
     #[tokio::main]
     pub async fn prepare_preload_libraries(
         &self,
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 79e6158081..133ab9f5af 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,8 +9,10 @@ use crate::catalog::SchemaDumpError;
 use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
-use compute_api::requests::ConfigurationRequest;
-use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
+use compute_api::requests::{ConfigurationRequest, SetRoleGrantsRequest};
+use compute_api::responses::{
+    ComputeStatus, ComputeStatusResponse, GenericAPIError, SetRoleGrantsResponse,
+};
 
 use anyhow::Result;
 use hyper::header::CONTENT_TYPE;
@@ -165,6 +167,48 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
             }
         }
 
+        (&Method::POST, "/grants") => {
+            info!("serving /grants POST request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for set_role_grants request: {:?}",
+                    status
+                );
+                error!(msg);
+                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
+            }
+
+            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
+            let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
+
+            let res = compute
+                .set_role_grants(
+                    &request.database,
+                    &request.schema,
+                    &request.privileges,
+                    &request.role,
+                )
+                .await;
+            match res {
+                Ok(()) => render_json(Body::from(
+                    serde_json::to_string(&SetRoleGrantsResponse {
+                        database: request.database,
+                        schema: request.schema,
+                        role: request.role,
+                        privileges: request.privileges,
+                    })
+                    .unwrap(),
+                )),
+                Err(e) => render_json_error(
+                    &format!("could not grant role privileges to the schema: {e}"),
+                    // TODO: can we filter on role/schema not found errors
+                    // and return appropriate error code?
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                ),
+            }
+        }
+
         // get the list of installed extensions
         // currently only used in python tests
         // TODO: call it from cplane
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index e9fa66b323..73dbdc3ee9 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -127,6 +127,41 @@ paths:
               schema:
                 $ref: "#/components/schemas/GenericError"
 
+  /grants:
+    post:
+      tags:
+        - Grants
+      summary: Apply grants to the database.
+      description: ""
+      operationId: setRoleGrants
+      requestBody:
+        description: Grants request.
+        required: true
+        content:
+          application/json:
+            schema:
+                $ref: "#/components/schemas/SetRoleGrantsRequest"
+      responses:
+        200:
+          description: Grants applied.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SetRoleGrantsResponse"
+        412:
+          description: |
+            Compute is not in the right state for processing the request.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: Error occurred during grants application.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
   /check_writability:
     post:
       tags:
@@ -427,6 +462,60 @@ components:
               n_databases:
                 type: integer
 
+    SetRoleGrantsRequest:
+      type: object
+      required:
+        - database
+        - schema
+        - privileges
+        - role
+      properties:
+        database:
+          type: string
+          description: Database name.
+          example: "neondb"
+        schema:
+          type: string
+          description: Schema name.
+          example: "public"
+        privileges:
+          type: array
+          items:
+            type: string
+          description: List of privileges to set.
+          example: ["SELECT", "INSERT"]
+        role:
+          type: string
+          description: Role name.
+          example: "neon"
+
+    SetRoleGrantsResponse:
+      type: object
+      required:
+        - database
+        - schema
+        - privileges
+        - role
+      properties:
+        database:
+          type: string
+          description: Database name.
+          example: "neondb"
+        schema:
+          type: string
+          description: Schema name.
+          example: "public"
+        privileges:
+          type: array
+          items:
+            type: string
+          description: List of privileges set.
+          example: ["SELECT", "INSERT"]
+        role:
+          type: string
+          description: Role name.
+          example: "neon"
+
     #
     # Errors
     #
diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs
index 210a52d089..f4f3d92fc6 100644
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,5 +1,6 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
+pub mod privilege;
 pub mod requests;
 pub mod responses;
 pub mod spec;
diff --git a/libs/compute_api/src/privilege.rs b/libs/compute_api/src/privilege.rs
new file mode 100644
index 0000000000..dc0d870946
--- /dev/null
+++ b/libs/compute_api/src/privilege.rs
@@ -0,0 +1,35 @@
+#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(rename_all = "UPPERCASE")]
+pub enum Privilege {
+    Select,
+    Insert,
+    Update,
+    Delete,
+    Truncate,
+    References,
+    Trigger,
+    Usage,
+    Create,
+    Connect,
+    Temporary,
+    Execute,
+}
+
+impl Privilege {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Privilege::Select => "SELECT",
+            Privilege::Insert => "INSERT",
+            Privilege::Update => "UPDATE",
+            Privilege::Delete => "DELETE",
+            Privilege::Truncate => "TRUNCATE",
+            Privilege::References => "REFERENCES",
+            Privilege::Trigger => "TRIGGER",
+            Privilege::Usage => "USAGE",
+            Privilege::Create => "CREATE",
+            Privilege::Connect => "CONNECT",
+            Privilege::Temporary => "TEMPORARY",
+            Privilege::Execute => "EXECUTE",
+        }
+    }
+}
diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs
index 5896c7dc65..fbc7577dd9 100644
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -1,6 +1,9 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.
 
-use crate::spec::ComputeSpec;
+use crate::{
+    privilege::Privilege,
+    spec::{ComputeSpec, PgIdent},
+};
 use serde::Deserialize;
 
 /// Request of the /configure API
@@ -12,3 +15,11 @@ use serde::Deserialize;
 pub struct ConfigurationRequest {
     pub spec: ComputeSpec,
 }
+
+#[derive(Deserialize, Debug)]
+pub struct SetRoleGrantsRequest {
+    pub database: PgIdent,
+    pub schema: PgIdent,
+    pub privileges: Vec<Privilege>,
+    pub role: PgIdent,
+}
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 5023fce003..fadf524273 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -6,7 +6,10 @@ use std::fmt::Display;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};
 
-use crate::spec::{ComputeSpec, Database, Role};
+use crate::{
+    privilege::Privilege,
+    spec::{ComputeSpec, Database, PgIdent, Role},
+};
 
 #[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
@@ -168,3 +171,11 @@ pub struct InstalledExtension {
 pub struct InstalledExtensions {
     pub extensions: Vec<InstalledExtension>,
 }
+
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct SetRoleGrantsResponse {
+    pub database: PgIdent,
+    pub schema: PgIdent,
+    pub privileges: Vec<Privilege>,
+    pub role: PgIdent,
+}
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
index 26895df8a6..e7b014b4a9 100644
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -28,3 +28,11 @@ class EndpointHttpClient(requests.Session):
         res = self.get(f"http://localhost:{self.port}/installed_extensions")
         res.raise_for_status()
         return res.json()
+
+    def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]):
+        res = self.post(
+            f"http://localhost:{self.port}/grants",
+            json={"database": database, "schema": schema, "role": role, "privileges": privileges},
+        )
+        res.raise_for_status()
+        return res.json()
diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py
new file mode 100644
index 0000000000..b2251875f0
--- /dev/null
+++ b/test_runner/regress/test_role_grants.py
@@ -0,0 +1,41 @@
+import psycopg2
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_role_grants(neon_simple_env: NeonEnv):
+    """basic test for the endpoint that grants permissions for a role against a schema"""
+
+    env = neon_simple_env
+
+    env.create_branch("test_role_grants")
+
+    endpoint = env.endpoints.create_start("test_role_grants")
+
+    endpoint.safe_psql("CREATE DATABASE test_role_grants")
+    endpoint.safe_psql("CREATE SCHEMA IF NOT EXISTS test_schema", dbname="test_role_grants")
+    endpoint.safe_psql("CREATE ROLE test_role WITH LOGIN", dbname="test_role_grants")
+
+    # confirm we do not yet have access
+    pg_conn = endpoint.connect(dbname="test_role_grants", user="test_role")
+    with pg_conn.cursor() as cur:
+        try:
+            cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)')
+            raise ValueError("create table should not succeed")
+        except psycopg2.errors.InsufficientPrivilege:
+            pass
+        except BaseException as e:
+            raise e
+
+    client = endpoint.http_client()
+    res = client.set_role_grants(
+        "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"]
+    )
+
+    # confirm we have access
+    with pg_conn.cursor() as cur:
+        cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)')
+        cur.execute('INSERT INTO "test_schema"."test_table" (id) VALUES (1)')
+        cur.execute('SELECT id from "test_schema"."test_table"')
+        res = cur.fetchall()
+
+        assert res == [(1,)], "select should not succeed"

From b7173b1ef05f694f3fa7968dadc4a298ea6d66e8 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 18 Oct 2024 11:29:23 +0100
Subject: [PATCH 16/48] storcon: fix case where we might fail to send compute
 notifications after two opposite migrations  (#9435)

## Problem

If we migrate A->B, then B->A, and the notification of A->B fails, then
we might have retained state that makes us think "A" is the last state
we sent to the compute hook, whereas when we migrate B->A we should
really be sending a fresh notification in case our earlier failed
notification has actually mutated the remote compute config.

Closes: #9417

## Summary of changes

- Add a reproducer for the bug
(`test_storage_controller_compute_hook_revert`)
- Refactor compute hook code to represent remote state with
`ComputeRemoteState` which stores a boolean for whether the compute has
fully applied the change as well as the request that the compute
accepted.
- The actual bug fix: after sending a compute notification, if we got a
423 response then update our ComputeRemoteState to reflect that we have
mutated the remote state. This way, when we later try and notify for our
historic location, we will properly see that as a change and send the
notification.

Co-authored-by: Vlad Lazar <vlad@neon.tech>
---
 storage_controller/src/compute_hook.rs        |  80 ++++++++---
 .../regress/test_storage_controller.py        | 127 ++++++++++++++++--
 2 files changed, 183 insertions(+), 24 deletions(-)

diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index bafae1f551..b63a322b87 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -28,7 +28,7 @@ struct UnshardedComputeHookTenant {
     node_id: NodeId,
 
     // Must hold this lock to send a notification.
-    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
+    send_lock: Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>>,
 }
 struct ShardedComputeHookTenant {
     stripe_size: ShardStripeSize,
@@ -38,7 +38,22 @@ struct ShardedComputeHookTenant {
     // Must hold this lock to send a notification.  The contents represent
     // the last successfully sent notification, and are used to coalesce multiple
     // updates by only sending when there is a chance since our last successful send.
-    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
+    send_lock: Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>>,
+}
+
+/// Represents our knowledge of the compute's state: we can update this when we get a
+/// response from a notify API call, which tells us what has been applied.
+///
+/// Should be wrapped in an Option<>, as we cannot always know the remote state.
+#[derive(PartialEq, Eq, Debug)]
+struct ComputeRemoteState {
+    // The request body which was acked by the compute
+    request: ComputeHookNotifyRequest,
+
+    // Whether the cplane indicated that the state was applied to running computes, or just
+    // persisted.  In the Neon control plane, this is the difference between a 423 response (meaning
+    // persisted but not applied), and a 2xx response (both persisted and applied)
+    applied: bool,
 }
 
 enum ComputeHookTenant {
@@ -64,7 +79,7 @@ impl ComputeHookTenant {
         }
     }
 
-    fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
+    fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>> {
         match self {
             Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
             Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
@@ -188,11 +203,11 @@ enum MaybeSendResult {
     Transmit(
         (
             ComputeHookNotifyRequest,
-            tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
+            tokio::sync::OwnedMutexGuard<Option<ComputeRemoteState>>,
         ),
     ),
     // Something requires sending, but you must wait for a current sender then call again
-    AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
+    AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeRemoteState>>>),
     // Nothing requires sending
     Noop,
 }
@@ -201,7 +216,7 @@ impl ComputeHookTenant {
     fn maybe_send(
         &self,
         tenant_id: TenantId,
-        lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
+        lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeRemoteState>>>,
     ) -> MaybeSendResult {
         let locked = match lock {
             Some(already_locked) => already_locked,
@@ -257,11 +272,22 @@ impl ComputeHookTenant {
                 tracing::info!("Tenant isn't yet ready to emit a notification");
                 MaybeSendResult::Noop
             }
-            Some(request) if Some(&request) == locked.as_ref() => {
-                // No change from the last value successfully sent
+            Some(request)
+                if Some(&request) == locked.as_ref().map(|s| &s.request)
+                    && locked.as_ref().map(|s| s.applied).unwrap_or(false) =>
+            {
+                tracing::info!(
+                    "Skipping notification because remote state already matches ({:?})",
+                    &request
+                );
+                // No change from the last value successfully sent, and our state indicates that the last
+                // value sent was fully applied on the control plane side.
                 MaybeSendResult::Noop
             }
-            Some(request) => MaybeSendResult::Transmit((request, locked)),
+            Some(request) => {
+                // Our request differs from the last one sent, or the last one sent was not fully applied on the compute side
+                MaybeSendResult::Transmit((request, locked))
+            }
         }
     }
 }
@@ -550,10 +576,28 @@ impl ComputeHook {
             })
         };
 
-        if result.is_ok() {
-            // Before dropping the send lock, stash the request we just sent so that
-            // subsequent callers can avoid redundantly re-sending the same thing.
-            *send_lock_guard = Some(request);
+        match result {
+            Ok(_) => {
+                // Before dropping the send lock, stash the request we just sent so that
+                // subsequent callers can avoid redundantly re-sending the same thing.
+                *send_lock_guard = Some(ComputeRemoteState {
+                    request,
+                    applied: true,
+                });
+            }
+            Err(NotifyError::Busy) => {
+                // Busy result means that the server responded and has stored the new configuration,
+                // but was not able to fully apply it to the compute
+                *send_lock_guard = Some(ComputeRemoteState {
+                    request,
+                    applied: false,
+                });
+            }
+            Err(_) => {
+                // General error case: we can no longer know the remote state, so clear it.  This will result in
+                // the logic in maybe_send recognizing that we should call the hook again.
+                *send_lock_guard = None;
+            }
         }
         result
     }
@@ -707,7 +751,10 @@ pub(crate) mod tests {
         assert!(request.stripe_size.is_none());
 
         // Simulate successful send
-        *guard = Some(request);
+        *guard = Some(ComputeRemoteState {
+            request,
+            applied: true,
+        });
         drop(guard);
 
         // Try asking again: this should be a no-op
@@ -750,7 +797,10 @@ pub(crate) mod tests {
         assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
 
         // Simulate successful send
-        *guard = Some(request);
+        *guard = Some(ComputeRemoteState {
+            request,
+            applied: true,
+        });
         drop(guard);
 
         Ok(())
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 1dcc37c407..a4e293da9e 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -576,6 +576,14 @@ def test_storage_controller_compute_hook(
     env.storage_controller.consistency_check()
 
 
+NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*"
+NOTIFY_FAILURE_LOGS = [
+    ".*Failed to notify compute.*",
+    ".*Reconcile error.*Cancelled",
+    ".*Reconcile error.*Control plane tenant busy",
+]
+
+
 def test_storage_controller_stuck_compute_hook(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
@@ -620,15 +628,8 @@ def test_storage_controller_stuck_compute_hook(
     dest_pageserver = env.get_pageserver(dest_ps_id)
     shard_0_id = TenantShardId(tenant_id, 0, 0)
 
-    NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*"
-    env.storage_controller.allowed_errors.extend(
-        [
-            NOTIFY_BLOCKED_LOG,
-            ".*Failed to notify compute.*",
-            ".*Reconcile error.*Cancelled",
-            ".*Reconcile error.*Control plane tenant busy",
-        ]
-    )
+    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
+    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
         # We expect the controller to hit the 423 (locked) and retry.  Migration shouldn't complete until that
@@ -719,6 +720,114 @@ def test_storage_controller_stuck_compute_hook(
     env.storage_controller.consistency_check()
 
 
+@run_only_on_default_postgres("this test doesn't start an endpoint")
+def test_storage_controller_compute_hook_revert(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address,
+):
+    """
+    'revert' in the sense of a migration which gets reversed shortly after, as may happen during
+    a rolling upgrade.
+
+    This is a reproducer for https://github.com/neondatabase/neon/issues/9417
+
+    The buggy behavior was that when the compute hook gave us errors, we assumed our last successfully
+    sent state was still in effect, so when migrating back to the original pageserver we didn't bother
+    notifying of that.  This is wrong because even a failed request might mutate the state on the server.
+    """
+
+    # We will run two pageservers to migrate and check that the storage controller sends notifications
+    # when migrating.
+    neon_env_builder.num_pageservers = 2
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+
+    # Set up fake HTTP notify endpoint
+    notifications = []
+
+    handle_params = {"status": 200}
+
+    def handler(request: Request):
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
+        notifications.append(request.json)
+        return Response(status=status)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    # Start running
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
+    tenant_id = env.initial_tenant
+    tenant_shard_id = TenantShardId(tenant_id, 0, 0)
+
+    pageserver_a = env.get_tenant_pageserver(tenant_id)
+    pageserver_b = [p for p in env.pageservers if p.id != pageserver_a.id][0]
+
+    def notified_ps(ps_id: int) -> None:
+        latest = notifications[-1]
+        log.info(f"Waiting for {ps_id}, have {latest}")
+        assert latest is not None
+        assert latest["shards"] is not None
+        assert latest["shards"][0]["node_id"] == ps_id
+
+    wait_until(30, 1, lambda: notified_ps(pageserver_a.id))
+
+    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
+    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
+
+    # Migrate A -> B, and make notifications fail while this is happening
+    handle_params["status"] = 423
+
+    with pytest.raises(StorageControllerApiException, match="Timeout waiting for shard"):
+        # We expect the controller to give us an error because its reconciliation timed out
+        # waiting for the compute hook.
+        env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_b.id)
+
+    # Although the migration API failed, the hook should still see pageserver B (it remembers what
+    # was posted even when returning an error code)
+    wait_until(30, 1, lambda: notified_ps(pageserver_b.id))
+
+    # Although the migration API failed, the tenant should still have moved to the right pageserver
+    assert len(pageserver_b.http_client().tenant_list()) == 1
+
+    # Before we clear the failure on the migration hook, we need the controller to give up
+    # trying to notify about B -- the bug case we're reproducing is when the controller
+    # _never_ successfully notified for B, then tries to notify for A.
+    #
+    # The controller will give up notifying if the origin of a migration becomes unavailable.
+    pageserver_a.stop()
+
+    # Preempt heartbeats for a faster test
+    env.storage_controller.node_configure(pageserver_a.id, {"availability": "Offline"})
+
+    def logged_giving_up():
+        env.storage_controller.assert_log_contains(".*Giving up on compute notification.*")
+
+    wait_until(30, 1, logged_giving_up)
+
+    pageserver_a.start()
+
+    # Preempt heartbeats for determinism
+    env.storage_controller.node_configure(pageserver_a.id, {"availability": "Active"})
+    # Starting node will prompt a reconcile to clean up old AttachedStale location, for a deterministic test
+    # we want that complete before we start our migration.  Tolerate failure because our compute hook is
+    # still configured to fail
+    try:
+        env.storage_controller.reconcile_all()
+    except StorageControllerApiException as e:
+        # This exception _might_ be raised: it depends if our reconcile_all hit the on-node-activation
+        # Reconciler lifetime or ran after it already completed.
+        log.info(f"Expected error from reconcile_all: {e}")
+
+    # Migrate B -> A, with a working compute hook: the controller should notify the hook because the
+    # last update it made that was acked (423) by the compute was for node B.
+    handle_params["status"] = 200
+    env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id)
+
+    wait_until(30, 1, lambda: notified_ps(pageserver_a.id))
+
+
 def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
     """
     Verify that occasional-use debug APIs work as expected.  This is a lightweight test

From 98fee7a97d68db55049583d403dcb21755bc4db5 Mon Sep 17 00:00:00 2001
From: Arseny Sher <ars@neon.tech>
Date: Fri, 18 Oct 2024 13:31:14 +0300
Subject: [PATCH 17/48] Increase shared_buffers in
 test_subscriber_synchronous_commit. (#9427)

Might make the test less flaky.
---
 test_runner/regress/test_logical_replication.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 87991eadf1..c26bf058e2 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -558,10 +558,10 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication
     return publisher_flush_lsn
 
 
-# Test that subscriber takes into account quorum committed flush_lsn in
-# flush_lsn reporting to publisher. Without this, it may ack too far, losing
-# data on restart because publisher advances START_REPLICATION position to the
-# confirmed_flush_lsn of the slot.
+# Test that neon subscriber takes into account quorum committed flush_lsn in
+# flush_lsn reporting to publisher. Without this, subscriber may ack too far,
+# losing data on restart because publisher implicitly advances position given
+# in START_REPLICATION to the confirmed_flush_lsn of the slot.
 def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
     # use vanilla as publisher to allow writes on it when safekeeper is down
@@ -578,7 +578,10 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg):
     vanilla_pg.safe_psql("create extension neon;")
 
     env.create_branch("subscriber")
-    sub = env.endpoints.create("subscriber")
+    # We want all data to fit into shared_buffers because later we stop
+    # safekeeper and insert more; this shouldn't cause page requests as they
+    # will be stuck.
+    sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"])
     sub.start()
 
     with vanilla_pg.cursor() as pcur:

From 15fecffe6ba400693619c6a022ed6205769a61ae Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Fri, 18 Oct 2024 12:42:41 +0200
Subject: [PATCH 18/48] Update ruff to much newer version (#9433)

Includes a multidict patch release to fix build with newer cpython.
---
 poetry.lock                                   | 207 ++++++++++--------
 pyproject.toml                                |   2 +-
 test_runner/fixtures/neon_cli.py              |   4 +-
 test_runner/fixtures/neon_fixtures.py         |  18 +-
 test_runner/fixtures/utils.py                 |   2 +-
 .../performance/test_logical_replication.py   |  14 +-
 .../performance/test_physical_replication.py  |  12 +-
 .../regress/test_download_extensions.py       |   2 +-
 test_runner/regress/test_next_xid.py          |   4 +-
 test_runner/regress/test_timeline_delete.py   |   2 +-
 10 files changed, 145 insertions(+), 122 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 00fe2505c9..e307b873f3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -1758,85 +1758,101 @@ tests = ["pytest (>=4.6)"]
 
 [[package]]
 name = "multidict"
-version = "6.0.4"
+version = "6.0.5"
 description = "multidict implementation"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"},
-    {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"},
-    {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"},
-    {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"},
-    {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"},
-    {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"},
-    {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"},
-    {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"},
-    {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"},
-    {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"},
-    {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"},
-    {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"},
-    {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"},
-    {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"},
-    {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"},
-    {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"},
-    {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"},
-    {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"},
-    {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"},
-    {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"},
-    {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"},
-    {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"},
-    {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"},
-    {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"},
+    {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"},
+    {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"},
+    {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"},
+    {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"},
+    {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"},
+    {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"},
+    {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"},
+    {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"},
+    {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"},
+    {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"},
+    {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"},
+    {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"},
+    {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"},
+    {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"},
+    {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"},
+    {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"},
+    {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"},
+    {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"},
+    {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"},
+    {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"},
+    {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"},
+    {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"},
+    {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"},
+    {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"},
+    {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"},
+    {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"},
+    {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"},
+    {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"},
+    {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"},
+    {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"},
 ]
 
 [[package]]
@@ -2766,28 +2782,29 @@ six = "*"
 
 [[package]]
 name = "ruff"
-version = "0.2.2"
+version = "0.7.0"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"},
-    {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"},
-    {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"},
-    {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"},
-    {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"},
-    {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"},
-    {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"},
-    {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"},
-    {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"},
-    {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"},
-    {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"},
-    {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"},
-    {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"},
-    {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"},
-    {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"},
-    {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"},
-    {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"},
+    {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"},
+    {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"},
+    {file = "ruff-0.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06"},
+    {file = "ruff-0.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be"},
+    {file = "ruff-0.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa"},
+    {file = "ruff-0.7.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495"},
+    {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598"},
+    {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e"},
+    {file = "ruff-0.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914"},
+    {file = "ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9"},
+    {file = "ruff-0.7.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4"},
+    {file = "ruff-0.7.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9"},
+    {file = "ruff-0.7.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d"},
+    {file = "ruff-0.7.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11"},
+    {file = "ruff-0.7.0-py3-none-win32.whl", hash = "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec"},
+    {file = "ruff-0.7.0-py3-none-win_amd64.whl", hash = "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2"},
+    {file = "ruff-0.7.0-py3-none-win_arm64.whl", hash = "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e"},
+    {file = "ruff-0.7.0.tar.gz", hash = "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b"},
 ]
 
 [[package]]
@@ -3389,4 +3406,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1"
+content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91"
diff --git a/pyproject.toml b/pyproject.toml
index 9cd315bb96..862ed49638 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ kafka-python = "^2.0.2"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
-ruff = "^0.2.2"
+ruff = "^0.7.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index 0d3dcd1671..1b2767e296 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import abc
 import json
 import os
 import re
@@ -30,7 +29,8 @@ if TYPE_CHECKING:
     T = TypeVar("T")
 
 
-class AbstractNeonCli(abc.ABC):
+# Used to be an ABC. abc.ABC removed due to linter without name change.
+class AbstractNeonCli:
     """
     A typed wrapper around an arbitrary Neon CLI tool.
     Supports a way to run arbitrary command directly via CLI.
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a313ac2ed3..3cd8019e32 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -386,9 +386,9 @@ class NeonEnvBuilder:
 
         self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
 
-        self.pageserver_default_tenant_config_compaction_algorithm: Optional[
-            dict[str, Any]
-        ] = pageserver_default_tenant_config_compaction_algorithm
+        self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = (
+            pageserver_default_tenant_config_compaction_algorithm
+        )
         if self.pageserver_default_tenant_config_compaction_algorithm is not None:
             log.debug(
                 f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
@@ -1062,9 +1062,9 @@ class NeonEnv:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
             if config.pageserver_default_tenant_config_compaction_algorithm is not None:
                 tenant_config = ps_cfg.setdefault("tenant_config", {})
-                tenant_config[
-                    "compaction_algorithm"
-                ] = config.pageserver_default_tenant_config_compaction_algorithm
+                tenant_config["compaction_algorithm"] = (
+                    config.pageserver_default_tenant_config_compaction_algorithm
+                )
 
             if self.pageserver_remote_storage is not None:
                 ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
@@ -1108,9 +1108,9 @@ class NeonEnv:
             if config.auth_enabled:
                 sk_cfg["auth_enabled"] = True
             if self.safekeepers_remote_storage is not None:
-                sk_cfg[
-                    "remote_storage"
-                ] = self.safekeepers_remote_storage.to_toml_inline_table().strip()
+                sk_cfg["remote_storage"] = (
+                    self.safekeepers_remote_storage.to_toml_inline_table().strip()
+                )
             self.safekeepers.append(
                 Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts)
             )
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 76575d330c..7ca6b3dd1c 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -417,7 +417,7 @@ def wait_until(
             time.sleep(interval)
             continue
         return res
-    raise Exception("timed out while waiting for %s" % func) from last_exception
+    raise Exception(f"timed out while waiting for {func}") from last_exception
 
 
 def assert_eq(a, b) -> None:
diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index dbf94a2cf5..815d186ab9 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -144,9 +144,10 @@ def test_subscriber_lag(
                 check_pgbench_still_running(pub_workload, "pub")
                 check_pgbench_still_running(sub_workload, "sub")
 
-                with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
-                    sub_connstr
-                ) as sub_conn:
+                with (
+                    psycopg2.connect(pub_connstr) as pub_conn,
+                    psycopg2.connect(sub_connstr) as sub_conn,
+                ):
                     with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
                         lag = measure_logical_replication_lag(sub_cur, pub_cur)
 
@@ -242,9 +243,10 @@ def test_publisher_restart(
                     ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
                     env=pub_env,
                 )
-                with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
-                    sub_connstr
-                ) as sub_conn:
+                with (
+                    psycopg2.connect(pub_connstr) as pub_conn,
+                    psycopg2.connect(sub_connstr) as sub_conn,
+                ):
                     with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
                         lag = measure_logical_replication_lag(sub_cur, pub_cur)
 
diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index 14b527acca..8b368977df 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -102,10 +102,14 @@ def test_ro_replica_lag(
                     check_pgbench_still_running(master_workload)
                     check_pgbench_still_running(replica_workload)
                     time.sleep(sync_interval_min * 60)
-                    with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect(
-                        replica_connstr
-                    ) as conn_replica:
-                        with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica:
+                    with (
+                        psycopg2.connect(master_connstr) as conn_master,
+                        psycopg2.connect(replica_connstr) as conn_replica,
+                    ):
+                        with (
+                            conn_master.cursor() as cur_master,
+                            conn_replica.cursor() as cur_replica,
+                        ):
                             lag = measure_replication_lag(cur_master, cur_replica)
                     log.info(f"Replica lagged behind master by {lag} seconds")
                     zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index 04916a6b6f..0134f80769 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -74,7 +74,7 @@ def test_remote_extensions(
             mimetype="application/octet-stream",
             headers=[
                 ("Content-Length", str(file_size)),
-                ("Content-Disposition", 'attachment; filename="%s"' % file_name),
+                ("Content-Disposition", f'attachment; filename="{file_name}"'),
             ],
             direct_passthrough=True,
         )
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index 980f6b5694..db8da51125 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -254,13 +254,13 @@ def advance_multixid_to(
     # missing. That's OK for our purposes. Autovacuum will print some warnings about the
     # missing segments, but will clean it up by truncating the SLRUs up to the new value,
     # closing the gap.
-    segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid)
+    segname = f"{MultiXactIdToOffsetSegment(next_multi_xid):04X}"
     log.info(f"Creating dummy segment pg_multixact/offsets/{segname}")
     with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of:
         of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ)
         of.flush()
 
-    segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset)
+    segname = f"{MXOffsetToMemberSegment(next_multi_offset):04X}"
     log.info(f"Creating dummy segment pg_multixact/members/{segname}")
     with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of:
         of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ)
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 306f22acf9..155709e106 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -649,7 +649,7 @@ def test_timeline_delete_works_for_remote_smoke(
     env = neon_env_builder.init_start()
 
     ps_http = env.pageserver.http_client()
-    pg = env.endpoints.create_start("main")
+    env.endpoints.create_start("main")
 
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline

From 3532ae76ef3a91131aee1f203a133c4d5e32b57a Mon Sep 17 00:00:00 2001
From: Jere Vaara <jerevaara@gmail.com>
Date: Fri, 18 Oct 2024 15:07:36 +0300
Subject: [PATCH 19/48] compute_ctl: Add endpoint that allows extensions to be
 installed (#9344)

Adds endpoint to install extensions:

**POST** `/extensions`
```
{"extension":"pg_session_jwt","database":"neondb","version":"1.0.0"}
```

Will be used by `local-proxy`.
For example, for JWT authentication to work, and for JWTs to be usable
in RLS policies, the database needs to have the pg_session_jwt extension
installed.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
---
 compute_tools/src/compute.rs             | 52 +++++++++++++++++-
 compute_tools/src/http/api.rs            | 37 ++++++++++++-
 compute_tools/src/http/openapi_spec.yaml | 69 +++++++++++++++++++++++-
 libs/compute_api/src/requests.rs         | 10 +++-
 libs/compute_api/src/responses.rs        |  7 ++-
 libs/compute_api/src/spec.rs             |  3 ++
 test_runner/fixtures/endpoint/http.py    | 10 ++++
 test_runner/regress/test_extensions.py   | 50 +++++++++++++++++
 8 files changed, 231 insertions(+), 7 deletions(-)
 create mode 100644 test_runner/regress/test_extensions.py

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 11fee73f03..c9dd4dcfc5 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -28,7 +28,7 @@ use utils::lsn::Lsn;
 
 use compute_api::privilege::Privilege;
 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion};
 use utils::measured_stream::MeasuredReader;
 
 use nix::sys::signal::{kill, Signal};
@@ -1416,6 +1416,56 @@ LIMIT 100",
         Ok(())
     }
 
+    pub async fn install_extension(
+        &self,
+        ext_name: &PgIdent,
+        db_name: &PgIdent,
+        ext_version: ExtVersion,
+    ) -> Result<ExtVersion> {
+        use tokio_postgres::config::Config;
+        use tokio_postgres::NoTls;
+
+        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
+        conf.dbname(db_name);
+
+        let (db_client, conn) = conf
+            .connect(NoTls)
+            .await
+            .context("Failed to connect to the database")?;
+        tokio::spawn(conn);
+
+        let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1";
+        let version: Option<ExtVersion> = db_client
+            .query_opt(version_query, &[&ext_name])
+            .await
+            .with_context(|| format!("Failed to execute query: {}", version_query))?
+            .map(|row| row.get(0));
+
+        // sanitize the inputs as postgres idents.
+        let ext_name: String = ext_name.pg_quote();
+        let quoted_version: String = ext_version.pg_quote();
+
+        if let Some(installed_version) = version {
+            if installed_version == ext_version {
+                return Ok(installed_version);
+            }
+            let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}");
+            db_client
+                .simple_query(&query)
+                .await
+                .with_context(|| format!("Failed to execute query: {}", query))?;
+        } else {
+            let query =
+                format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
+            db_client
+                .simple_query(&query)
+                .await
+                .with_context(|| format!("Failed to execute query: {}", query))?;
+        }
+
+        Ok(ext_version)
+    }
+
     #[tokio::main]
     pub async fn prepare_preload_libraries(
         &self,
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 133ab9f5af..af35f71bf2 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,9 +9,10 @@ use crate::catalog::SchemaDumpError;
 use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
-use compute_api::requests::{ConfigurationRequest, SetRoleGrantsRequest};
+use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
 use compute_api::responses::{
-    ComputeStatus, ComputeStatusResponse, GenericAPIError, SetRoleGrantsResponse,
+    ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
+    SetRoleGrantsResponse,
 };
 
 use anyhow::Result;
@@ -100,6 +101,38 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
             }
         }
 
+        (&Method::POST, "/extensions") => {
+            info!("serving /extensions POST request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for extensions request: {:?}",
+                    status
+                );
+                error!(msg);
+                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
+            }
+
+            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
+            let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
+            let res = compute
+                .install_extension(&request.extension, &request.database, request.version)
+                .await;
+            match res {
+                Ok(version) => render_json(Body::from(
+                    serde_json::to_string(&ExtensionInstallResult {
+                        extension: request.extension,
+                        version,
+                    })
+                    .unwrap(),
+                )),
+                Err(e) => {
+                    error!("install_extension failed: {}", e);
+                    render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
         (&Method::GET, "/info") => {
             let num_cpus = num_cpus::get_physical();
             info!("serving /info GET request. num_cpus: {}", num_cpus);
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 73dbdc3ee9..11eee6ccfd 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -179,6 +179,41 @@ paths:
                 description: Error text or 'true' if check passed.
                 example: "true"
 
+  /extensions:
+    post:
+      tags:
+        - Extensions
+      summary: Install extension if possible.
+      description: ""
+      operationId: installExtension
+      requestBody:
+        description: Extension name and database to install it to.
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/ExtensionInstallRequest"
+      responses:
+        200:
+          description: Result from extension installation
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ExtensionInstallResult"
+        412:
+          description: |
+            Compute is in the wrong state for processing the request.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: Error during extension installation.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
   /configure:
     post:
       tags:
@@ -404,7 +439,7 @@ components:
             moment, when spec was received.
           example: "2022-10-12T07:20:50.52Z"
         status:
-          $ref: '#/components/schemas/ComputeStatus'
+          $ref: "#/components/schemas/ComputeStatus"
         last_active:
           type: string
           description: |
@@ -444,6 +479,38 @@ components:
         - configuration
       example: running
 
+    ExtensionInstallRequest:
+      type: object
+      required:
+        - extension
+        - database
+        - version
+      properties:
+        extension:
+          type: string
+          description: Extension name.
+          example: "pg_session_jwt"
+        version:
+          type: string
+          description: Version of the extension.
+          example: "1.0.0"
+        database:
+          type: string
+          description: Database name.
+          example: "neondb"
+
+    ExtensionInstallResult:
+      type: object
+      properties:
+        extension:
+          description: Name of the extension.
+          type: string
+          example: "pg_session_jwt"
+        version:
+          description: Version of the extension.
+          type: string
+          example: "1.0.0"
+
     InstalledExtensions:
       type: object
       properties:
diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs
index fbc7577dd9..fc3757d981 100644
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -1,8 +1,7 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.
-
 use crate::{
     privilege::Privilege,
-    spec::{ComputeSpec, PgIdent},
+    spec::{ComputeSpec, ExtVersion, PgIdent},
 };
 use serde::Deserialize;
 
@@ -16,6 +15,13 @@ pub struct ConfigurationRequest {
     pub spec: ComputeSpec,
 }
 
+#[derive(Deserialize, Debug)]
+pub struct ExtensionInstallRequest {
+    pub extension: PgIdent,
+    pub database: PgIdent,
+    pub version: ExtVersion,
+}
+
 #[derive(Deserialize, Debug)]
 pub struct SetRoleGrantsRequest {
     pub database: PgIdent,
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index fadf524273..79234be720 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize, Serializer};
 
 use crate::{
     privilege::Privilege,
-    spec::{ComputeSpec, Database, PgIdent, Role},
+    spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role},
 };
 
 #[derive(Serialize, Debug, Deserialize)]
@@ -172,6 +172,11 @@ pub struct InstalledExtensions {
     pub extensions: Vec<InstalledExtension>,
 }
 
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct ExtensionInstallResult {
+    pub extension: PgIdent,
+    pub version: ExtVersion,
+}
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct SetRoleGrantsResponse {
     pub database: PgIdent,
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 5903db7055..8a447563dc 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -16,6 +16,9 @@ use remote_storage::RemotePath;
 /// intended to be used for DB / role names.
 pub type PgIdent = String;
 
+/// String type alias representing Postgres extension version
+pub type ExtVersion = String;
+
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
index e7b014b4a9..ea8291c1e0 100644
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -29,6 +29,16 @@ class EndpointHttpClient(requests.Session):
         res.raise_for_status()
         return res.json()
 
+    def extensions(self, extension: str, version: str, database: str):
+        body = {
+            "extension": extension,
+            "version": version,
+            "database": database,
+        }
+        res = self.post(f"http://localhost:{self.port}/extensions", json=body)
+        res.raise_for_status()
+        return res.json()
+
     def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]):
         res = self.post(
             f"http://localhost:{self.port}/grants",
diff --git a/test_runner/regress/test_extensions.py b/test_runner/regress/test_extensions.py
new file mode 100644
index 0000000000..100fd4b048
--- /dev/null
+++ b/test_runner/regress/test_extensions.py
@@ -0,0 +1,50 @@
+from logging import info
+
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_extensions(neon_simple_env: NeonEnv):
+    """basic test for the extensions endpoint testing installing extensions"""
+
+    env = neon_simple_env
+
+    env.create_branch("test_extensions")
+
+    endpoint = env.endpoints.create_start("test_extensions")
+    extension = "neon_test_utils"
+    database = "test_extensions"
+
+    endpoint.safe_psql("CREATE DATABASE test_extensions")
+
+    with endpoint.connect(dbname=database) as pg_conn:
+        with pg_conn.cursor() as cur:
+            cur.execute(
+                "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'"
+            )
+            res = cur.fetchone()
+            assert res is not None
+            version = res[0]
+
+        with pg_conn.cursor() as cur:
+            cur.execute(
+                "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'",
+            )
+            res = cur.fetchone()
+            assert not res, "The 'neon_test_utils' extension is installed"
+
+    client = endpoint.http_client()
+    install_res = client.extensions(extension, version, database)
+
+    info("Extension install result: %s", res)
+    assert install_res["extension"] == extension and install_res["version"] == version
+
+    with endpoint.connect(dbname=database) as pg_conn:
+        with pg_conn.cursor() as cur:
+            cur.execute(
+                "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'",
+            )
+            res = cur.fetchone()
+            assert res is not None
+            (db_extension_name, db_extension_version) = res
+
+    assert db_extension_name == extension and db_extension_version == version

From fecff15f18f00a692ff234106b064d1693cc5441 Mon Sep 17 00:00:00 2001
From: Arseny Sher <ars@neon.tech>
Date: Fri, 18 Oct 2024 15:31:50 +0300
Subject: [PATCH 20/48] walproposer: immediately exit if sync-safekeepers
 collected 0/0. (#9442)

Otherwise term history starting with 0/0 is streamed to safekeepers.

ref https://github.com/neondatabase/neon/issues/9434
---
 pgxn/neon/walproposer.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index a3f33cb261..d2a6104c74 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -841,6 +841,23 @@ HandleElectedProposer(WalProposer *wp)
 		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
 
+	/*
+	 * Zero propEpochStartLsn means a majority of safekeepers don't have any
+	 * WAL, timeline was just created. Compute bumps it to basebackup LSN,
+	 * otherwise we must be sync-safekeepers and we have nothing to do then.
+	 *
+	 * Proceeding is not only pointless but harmful, because we'd give
+	 * safekeepers term history starting with 0/0. These hacks will go away once
+	 * we disable implicit timeline creation on safekeepers and create it with
+	 * non zero LSN from the start.
+	 */
+	if (wp->propEpochStartLsn == InvalidXLogRecPtr)
+	{
+		Assert(wp->config->syncSafekeepers);
+		wp_log(LOG, "elected with zero propEpochStartLsn in sync-safekeepers, exiting");
+		wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
+	}
+
 	if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
 	{
 		/* Sync is not needed: just exit */

From ec6d3422a5a7b6f537b029d7c3e70b7a60f99e0c Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 18 Oct 2024 13:38:59 +0100
Subject: [PATCH 21/48] pageserver: disconnect when asking client to reconnect
 (#9390)

## Problem

Consider the following sequence of events:
1. Shard location gets downgraded to secondary while there's a libpq
connection in pagestream mode from the compute
2. There's no active tenant, so we return `QueryError::Reconnect` from
`PageServerHandler::handle_get_page_at_lsn_request`.
3. Error bubbles up to `PostgresBackendIO::process_message`, bailing us
out of pagestream mode.
4. We instruct the client to reconnect, but continue serving the libpq
connection. The client isn't yet aware of the request to reconnect and
believes it is still in pagestream mode. Pageserver fails to deserialize
get page requests wrapped in `CopyData` since it's not in pagestream
mode.

## Summary of Changes

When we wish to instruct the client to reconnect, also disconnect from
the server side after flushing the error.

Closes https://github.com/neondatabase/cloud/issues/17336
---
 libs/postgres_backend/src/lib.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 9d274b25e6..7419798a60 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -738,6 +738,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                         QueryError::SimulatedConnectionError => {
                             return Err(QueryError::SimulatedConnectionError)
                         }
+                        err @ QueryError::Reconnect => {
+                            // Instruct the client to reconnect, stop processing messages
+                            // from this libpq connection and, finally, disconnect from the
+                            // server side (returning an Err achieves the latter).
+                            //
+                            // Note the flushing is done by the caller.
+                            let reconnect_error = short_error(&err);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &reconnect_error,
+                                Some(err.pg_error_code()),
+                            ))?;
+
+                            return Err(err);
+                        }
                         e => {
                             log_query_error(query_string, &e);
                             let short_error = short_error(&e);

From 5cbdec9c794ef414e7511d644450b1a9a944d4ff Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 18 Oct 2024 14:41:21 +0100
Subject: [PATCH 22/48] [local_proxy]: install pg_session_jwt extension on
 demand (#9370)

Follow up on #9344. We want to install the extension automatically. We
didn't want to couple the extension into compute_ctl so instead
local_proxy is the one to issue requests specific to the extension.

depends on #9344 and #9395
---
 compute/Dockerfile.compute-node         |   4 +-
 proxy/src/auth/backend/local.rs         |  13 ++-
 proxy/src/bin/local_proxy.rs            |   8 +-
 proxy/src/compute_ctl/mod.rs            | 101 ++++++++++++++++++++++++
 proxy/src/http/mod.rs                   |  13 ++-
 proxy/src/lib.rs                        |   1 +
 proxy/src/serverless/backend.rs         |  54 +++++++++++--
 proxy/src/serverless/local_conn_pool.rs |  57 +++++++++----
 8 files changed, 222 insertions(+), 29 deletions(-)
 create mode 100644 proxy/src/compute_ctl/mod.rs

diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node
index 45c1fd9f38..74970696b5 100644
--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -975,8 +975,8 @@ ARG PG_VERSION
 RUN case "${PG_VERSION}" in "v17") \
     echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
     esac && \
-    wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \
+    wget https://github.com/neondatabase/pg_session_jwt/archive/e642528f429dd3f5403845a50191b78d434b84a6.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "1a69210703cc91224785e59a0a67562dd9eed9a0914ac84b11447582ca0d5b93 pg_session_jwt.tar.gz" | sha256sum --check && \
     mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index e3995ac6c0..1e029ff609 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -1,23 +1,32 @@
 use std::net::SocketAddr;
 
 use arc_swap::ArcSwapOption;
+use tokio::sync::Semaphore;
 
 use super::jwt::{AuthRule, FetchAuthRules};
 use crate::auth::backend::jwt::FetchAuthRulesError;
 use crate::compute::ConnCfg;
+use crate::compute_ctl::ComputeCtlApi;
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo};
 use crate::control_plane::NodeInfo;
 use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag};
-use crate::EndpointId;
+use crate::url::ApiUrl;
+use crate::{http, EndpointId};
 
 pub struct LocalBackend {
+    pub(crate) initialize: Semaphore,
+    pub(crate) compute_ctl: ComputeCtlApi,
     pub(crate) node_info: NodeInfo,
 }
 
 impl LocalBackend {
-    pub fn new(postgres_addr: SocketAddr) -> Self {
+    pub fn new(postgres_addr: SocketAddr, compute_ctl: ApiUrl) -> Self {
         LocalBackend {
+            initialize: Semaphore::new(1),
+            compute_ctl: ComputeCtlApi {
+                api: http::Endpoint::new(compute_ctl, http::new_client()),
+            },
             node_info: NodeInfo {
                 config: {
                     let mut cfg = ConnCfg::new();
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index e6bc369d9a..a16c288e5d 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -25,6 +25,7 @@ use proxy::rate_limiter::{
 use proxy::scram::threadpool::ThreadPool;
 use proxy::serverless::cancel_set::CancelSet;
 use proxy::serverless::{self, GlobalConnPoolOptions};
+use proxy::url::ApiUrl;
 use proxy::RoleName;
 
 project_git_version!(GIT_VERSION);
@@ -80,7 +81,10 @@ struct LocalProxyCliArgs {
     connect_to_compute_retry: String,
     /// Address of the postgres server
     #[clap(long, default_value = "127.0.0.1:5432")]
-    compute: SocketAddr,
+    postgres: SocketAddr,
+    /// Address of the compute-ctl api service
+    #[clap(long, default_value = "http://127.0.0.1:3080/")]
+    compute_ctl: ApiUrl,
     /// Path of the local proxy config file
     #[clap(long, default_value = "./local_proxy.json")]
     config_path: Utf8PathBuf,
@@ -295,7 +299,7 @@ fn build_auth_backend(
     args: &LocalProxyCliArgs,
 ) -> anyhow::Result<&'static auth::Backend<'static, ()>> {
     let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
-        LocalBackend::new(args.compute),
+        LocalBackend::new(args.postgres, args.compute_ctl.clone()),
     ));
 
     Ok(Box::leak(Box::new(auth_backend)))
diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs
new file mode 100644
index 0000000000..2b57897223
--- /dev/null
+++ b/proxy/src/compute_ctl/mod.rs
@@ -0,0 +1,101 @@
+use compute_api::responses::GenericAPIError;
+use hyper::{Method, StatusCode};
+use serde::de::DeserializeOwned;
+use serde::{Deserialize, Serialize};
+use thiserror::Error;
+
+use crate::url::ApiUrl;
+use crate::{http, DbName, RoleName};
+
+pub struct ComputeCtlApi {
+    pub(crate) api: http::Endpoint,
+}
+
+#[derive(Serialize, Debug)]
+pub struct ExtensionInstallRequest {
+    pub extension: &'static str,
+    pub database: DbName,
+    pub version: &'static str,
+}
+
+#[derive(Serialize, Debug)]
+pub struct SetRoleGrantsRequest {
+    pub database: DbName,
+    pub schema: &'static str,
+    pub privileges: Vec<Privilege>,
+    pub role: RoleName,
+}
+
+#[derive(Clone, Debug, Deserialize)]
+pub struct ExtensionInstallResponse {}
+
+#[derive(Clone, Debug, Deserialize)]
+pub struct SetRoleGrantsResponse {}
+
+#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
+#[serde(rename_all = "UPPERCASE")]
+pub enum Privilege {
+    Usage,
+}
+
+#[derive(Error, Debug)]
+pub enum ComputeCtlError {
+    #[error("connection error: {0}")]
+    ConnectionError(#[source] reqwest_middleware::Error),
+    #[error("request error [{status}]: {body:?}")]
+    RequestError {
+        status: StatusCode,
+        body: Option<GenericAPIError>,
+    },
+    #[error("response parsing error: {0}")]
+    ResponseError(#[source] reqwest::Error),
+}
+
+impl ComputeCtlApi {
+    pub async fn install_extension(
+        &self,
+        req: &ExtensionInstallRequest,
+    ) -> Result<ExtensionInstallResponse, ComputeCtlError> {
+        self.generic_request(req, Method::POST, |url| {
+            url.path_segments_mut().push("extensions");
+        })
+        .await
+    }
+
+    pub async fn grant_role(
+        &self,
+        req: &SetRoleGrantsRequest,
+    ) -> Result<SetRoleGrantsResponse, ComputeCtlError> {
+        self.generic_request(req, Method::POST, |url| {
+            url.path_segments_mut().push("grants");
+        })
+        .await
+    }
+
+    async fn generic_request<Req, Resp>(
+        &self,
+        req: &Req,
+        method: Method,
+        url: impl for<'a> FnOnce(&'a mut ApiUrl),
+    ) -> Result<Resp, ComputeCtlError>
+    where
+        Req: Serialize,
+        Resp: DeserializeOwned,
+    {
+        let resp = self
+            .api
+            .request_with_url(method, url)
+            .json(req)
+            .send()
+            .await
+            .map_err(ComputeCtlError::ConnectionError)?;
+
+        let status = resp.status();
+        if status.is_client_error() || status.is_server_error() {
+            let body = resp.json().await.ok();
+            return Err(ComputeCtlError::RequestError { status, body });
+        }
+
+        resp.json().await.map_err(ComputeCtlError::ResponseError)
+    }
+}
diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs
index fd587e8f01..f1b632e704 100644
--- a/proxy/src/http/mod.rs
+++ b/proxy/src/http/mod.rs
@@ -8,6 +8,7 @@ use std::time::Duration;
 
 use anyhow::bail;
 use bytes::Bytes;
+use http::Method;
 use http_body_util::BodyExt;
 use hyper::body::Body;
 pub(crate) use reqwest::{Request, Response};
@@ -93,9 +94,19 @@ impl Endpoint {
     /// Return a [builder](RequestBuilder) for a `GET` request,
     /// accepting a closure to modify the url path segments for more complex paths queries.
     pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder {
+        self.request_with_url(Method::GET, f)
+    }
+
+    /// Return a [builder](RequestBuilder) for a request,
+    /// accepting a closure to modify the url path segments for more complex paths queries.
+    pub(crate) fn request_with_url(
+        &self,
+        method: Method,
+        f: impl for<'a> FnOnce(&'a mut ApiUrl),
+    ) -> RequestBuilder {
         let mut url = self.endpoint.clone();
         f(&mut url);
-        self.client.get(url.into_inner())
+        self.client.request(method, url.into_inner())
     }
 
     /// Execute a [request](reqwest::Request).
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index a7b3d45c95..ea17a88067 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -90,6 +90,7 @@ pub mod auth;
 pub mod cache;
 pub mod cancellation;
 pub mod compute;
+pub mod compute_ctl;
 pub mod config;
 pub mod console_redirect_proxy;
 pub mod context;
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 82e81dbcfe..5d59b4d252 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -14,10 +14,13 @@ use tracing::{debug, info};
 use super::conn_pool::poll_client;
 use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
 use super::http_conn_pool::{self, poll_http2_client, Send};
-use super::local_conn_pool::{self, LocalClient, LocalConnPool};
+use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
 use crate::auth::backend::local::StaticAuthRules;
 use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
 use crate::auth::{self, check_peer_addr_is_in_list, AuthError};
+use crate::compute_ctl::{
+    ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
+};
 use crate::config::ProxyConfig;
 use crate::context::RequestMonitoring;
 use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
@@ -35,6 +38,7 @@ pub(crate) struct PoolingBackend {
     pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool<Send>>,
     pub(crate) local_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
     pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+
     pub(crate) config: &'static ProxyConfig,
     pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>,
     pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -250,16 +254,47 @@ impl PoolingBackend {
             return Ok(client);
         }
 
+        let local_backend = match &self.auth_backend {
+            auth::Backend::ControlPlane(_, ()) => {
+                unreachable!("only local_proxy can connect to local postgres")
+            }
+            auth::Backend::Local(local) => local,
+        };
+
+        if !self.local_pool.initialized(&conn_info) {
+            // only install and grant usage one at a time.
+            let _permit = local_backend.initialize.acquire().await.unwrap();
+
+            // check again for race
+            if !self.local_pool.initialized(&conn_info) {
+                local_backend
+                    .compute_ctl
+                    .install_extension(&ExtensionInstallRequest {
+                        extension: EXT_NAME,
+                        database: conn_info.dbname.clone(),
+                        version: EXT_VERSION,
+                    })
+                    .await?;
+
+                local_backend
+                    .compute_ctl
+                    .grant_role(&SetRoleGrantsRequest {
+                        schema: EXT_SCHEMA,
+                        privileges: vec![Privilege::Usage],
+                        database: conn_info.dbname.clone(),
+                        role: conn_info.user_info.user.clone(),
+                    })
+                    .await?;
+
+                self.local_pool.set_initialized(&conn_info);
+            }
+        }
+
         let conn_id = uuid::Uuid::new_v4();
         tracing::Span::current().record("conn_id", display(conn_id));
         info!(%conn_id, "local_pool: opening a new connection '{conn_info}'");
 
-        let mut node_info = match &self.auth_backend {
-            auth::Backend::ControlPlane(_, ()) => {
-                unreachable!("only local_proxy can connect to local postgres")
-            }
-            auth::Backend::Local(local) => local.node_info.clone(),
-        };
+        let mut node_info = local_backend.node_info.clone();
 
         let (key, jwk) = create_random_jwk();
 
@@ -324,6 +359,8 @@ pub(crate) enum HttpConnError {
     #[error("could not parse JWT payload")]
     JwtPayloadError(serde_json::Error),
 
+    #[error("could not install extension: {0}")]
+    ComputeCtl(#[from] ComputeCtlError),
     #[error("could not get auth info")]
     GetAuthInfo(#[from] GetAuthInfoError),
     #[error("user not authenticated")]
@@ -348,6 +385,7 @@ impl ReportableError for HttpConnError {
             HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
             HttpConnError::PostgresConnectionError(p) => p.get_error_kind(),
             HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute,
+            HttpConnError::ComputeCtl(_) => ErrorKind::Service,
             HttpConnError::JwtPayloadError(_) => ErrorKind::User,
             HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
             HttpConnError::AuthError(a) => a.get_error_kind(),
@@ -363,6 +401,7 @@ impl UserFacingError for HttpConnError {
             HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(),
             HttpConnError::PostgresConnectionError(p) => p.to_string(),
             HttpConnError::LocalProxyConnectionError(p) => p.to_string(),
+            HttpConnError::ComputeCtl(_) => "could not set up the JWT authorization database extension".to_string(),
             HttpConnError::JwtPayloadError(p) => p.to_string(),
             HttpConnError::GetAuthInfo(c) => c.to_string_client(),
             HttpConnError::AuthError(c) => c.to_string_client(),
@@ -379,6 +418,7 @@ impl CouldRetry for HttpConnError {
         match self {
             HttpConnError::PostgresConnectionError(e) => e.could_retry(),
             HttpConnError::LocalProxyConnectionError(e) => e.could_retry(),
+            HttpConnError::ComputeCtl(_) => false,
             HttpConnError::ConnectionClosedAbruptly(_) => false,
             HttpConnError::JwtPayloadError(_) => false,
             HttpConnError::GetAuthInfo(_) => false,
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index a01afd2820..beb2ad4e8f 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -1,3 +1,14 @@
+//! Manages the pool of connections between local_proxy and postgres.
+//!
+//! The pool is keyed by database and role_name, and can contain multiple connections
+//! shared between users.
+//!
+//! The pool manages the pg_session_jwt extension used for authorizing
+//! requests in the db.
+//!
+//! The first time a db/role pair is seen, local_proxy attempts to install the extension
+//! and grant usage to the role on the given schema.
+
 use std::collections::HashMap;
 use std::pin::pin;
 use std::sync::{Arc, Weak};
@@ -27,14 +38,15 @@ use crate::metrics::Metrics;
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
 use crate::{DbName, RoleName};
 
+pub(crate) const EXT_NAME: &str = "pg_session_jwt";
+pub(crate) const EXT_VERSION: &str = "0.1.1";
+pub(crate) const EXT_SCHEMA: &str = "auth";
+
 struct ConnPoolEntry<C: ClientInnerExt> {
     conn: ClientInner<C>,
     _last_access: std::time::Instant,
 }
 
-// /// key id for the pg_session_jwt state
-// static PG_SESSION_JWT_KID: AtomicU64 = AtomicU64::new(1);
-
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
@@ -140,11 +152,18 @@ impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
 
 pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
     conns: Vec<ConnPoolEntry<C>>,
+
+    // true if we have definitely installed the extension and
+    // granted the role access to the auth schema.
+    initialized: bool,
 }
 
 impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
     fn default() -> Self {
-        Self { conns: Vec::new() }
+        Self {
+            conns: Vec::new(),
+            initialized: false,
+        }
     }
 }
 
@@ -199,25 +218,16 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
         self.config.pool_options.idle_timeout
     }
 
-    // pub(crate) fn shutdown(&self) {
-    //     let mut pool = self.global_pool.write();
-    //     pool.pools.clear();
-    //     pool.total_conns = 0;
-    // }
-
     pub(crate) fn get(
         self: &Arc<Self>,
         ctx: &RequestMonitoring,
         conn_info: &ConnInfo,
     ) -> Result<Option<LocalClient<C>>, HttpConnError> {
-        let mut client: Option<ClientInner<C>> = None;
-        if let Some(entry) = self
+        let client = self
             .global_pool
             .write()
             .get_conn_entry(conn_info.db_and_user())
-        {
-            client = Some(entry.conn);
-        }
+            .map(|entry| entry.conn);
 
         // ok return cached connection if found and establish a new one otherwise
         if let Some(client) = client {
@@ -245,6 +255,23 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
         }
         Ok(None)
     }
+
+    pub(crate) fn initialized(self: &Arc<Self>, conn_info: &ConnInfo) -> bool {
+        self.global_pool
+            .read()
+            .pools
+            .get(&conn_info.db_and_user())
+            .map_or(false, |pool| pool.initialized)
+    }
+
+    pub(crate) fn set_initialized(self: &Arc<Self>, conn_info: &ConnInfo) {
+        self.global_pool
+            .write()
+            .pools
+            .entry(conn_info.db_and_user())
+            .or_default()
+            .initialized = true;
+    }
 }
 
 #[allow(clippy::too_many_arguments)]

From e162ab8b536e8b1d2277b4e2c00abd574c394d75 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 18 Oct 2024 15:33:04 +0100
Subject: [PATCH 23/48] storcon: handle ongoing deletions gracefully (#9449)

## Problem

Pageserver returns 409 (Conflict) if any of the shards are already
deleting the timeline. This resulted in an error being propagated out of
the HTTP handler and to the client. It's an expected scenario so we
should handle it nicely.

This caused failures in `test_storage_controller_smoke`
[here](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9435/11390431900/index.html#suites/8fc5d1648d2225380766afde7c428d81/86eee4b002d6572d).

## Summary of Changes

Instead of returning an error on 409s, we now bubble the status code up
and let the HTTP handler code retry until it gets a 404 or times out.
---
 storage_controller/src/http.rs    | 18 ++++++++++++------
 storage_controller/src/service.rs | 29 +++++++++++++++++++++--------
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 46b6f4f2bf..afefe8598c 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete(
         R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
         F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
     {
+        // On subsequent retries, wait longer.
+        // Enable callers with a 30 second request timeout to reliably get a response
+        const MAX_WAIT: Duration = Duration::from_secs(25);
+        const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5);
+
         let started_at = Instant::now();
+
         // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
         // completed.
         let mut retry_period = Duration::from_secs(1);
-        // On subsequent retries, wait longer.
-        let max_retry_period = Duration::from_secs(5);
-        // Enable callers with a 30 second request timeout to reliably get a response
-        let max_wait = Duration::from_secs(25);
 
         loop {
             let status = f(service.clone()).await?;
@@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete(
                 StatusCode::ACCEPTED => {
                     tracing::info!("Deletion accepted, waiting to try again...");
                     tokio::time::sleep(retry_period).await;
-                    retry_period = max_retry_period;
+                    retry_period = MAX_RETRY_PERIOD;
+                }
+                StatusCode::CONFLICT => {
+                    tracing::info!("Deletion already in progress, waiting to try again...");
+                    tokio::time::sleep(retry_period).await;
                 }
                 StatusCode::NOT_FOUND => {
                     tracing::info!("Deletion complete");
@@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete(
             }
 
             let now = Instant::now();
-            if now + retry_period > started_at + max_wait {
+            if now + retry_period > started_at + MAX_WAIT {
                 tracing::info!("Deletion timed out waiting for 404");
                 // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
                 // the pageserver's swagger definition for this endpoint, and has the same desired
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ab2c3b5e48..01aa8f1dab 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3630,14 +3630,21 @@ impl Service {
                 );
 
                 let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-                client
+                let res = client
                     .timeline_delete(tenant_shard_id, timeline_id)
-                    .await
-                    .map_err(|e| {
-                        ApiError::InternalServerError(anyhow::anyhow!(
-                            "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
-                        ))
-                    })
+                    .await;
+
+                match res {
+                    Ok(ok) => Ok(ok),
+                    Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
+                    Err(e) => {
+                        Err(
+                            ApiError::InternalServerError(anyhow::anyhow!(
+                                "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+                            ))
+                        )
+                    }
+                }
             }
 
             let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect();
@@ -3652,7 +3659,13 @@ impl Service {
                 })
                 .await?;
 
-            // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
+            // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero.
+            // We return 409 (Conflict) if deletion was already in progress on any of the shards
+            // and 202 (Accepted) if deletion was not already in progress on any of the shards.
+            if statuses.iter().any(|s| s == &StatusCode::CONFLICT) {
+                return Ok(StatusCode::CONFLICT);
+            }
+
             if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
                 return Ok(StatusCode::ACCEPTED);
             }

From 62a334871fef32b754ab98a772ebbbbed8d1aa1c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 18 Oct 2024 09:36:29 -0500
Subject: [PATCH 24/48] Take the collector name as argument when generating
 sql_exporter configs

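In neon_collector_autoscaling.jsonnet, the collector name is hardcoded
to neon_collector_autoscaling, while sql_exporter.jsonnet always
referenced a collector named neon_collector. This issue manifests itself
such that sql_exporter, when run with the autoscaling configuration,
would not find the collector configuration.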

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute/Makefile                 | 2 ++
 compute/etc/sql_exporter.jsonnet | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/compute/Makefile b/compute/Makefile
index e4f08a223c..e2896fe390 100644
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -20,12 +20,14 @@ neon_collector_autoscaling.yml: $(jsonnet_files)
 sql_exporter.yml: $(jsonnet_files)
 	JSONNET_PATH=etc jsonnet \
 		--output-file etc/$@ \
+		--tla-str collector_name=neon_collector \
 		--tla-str collector_file=neon_collector.yml \
 		etc/sql_exporter.jsonnet
 
 sql_exporter_autoscaling.yml: $(jsonnet_files)
 	JSONNET_PATH=etc jsonnet \
 		--output-file etc/$@ \
+		--tla-str collector_name=neon_collector_autoscaling \
 		--tla-str collector_file=neon_collector_autoscaling.yml \
 		--tla-str application_name=sql_exporter_autoscaling \
 		etc/sql_exporter.jsonnet
diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet
index 640e2ac38d..3c36fd4f68 100644
--- a/compute/etc/sql_exporter.jsonnet
+++ b/compute/etc/sql_exporter.jsonnet
@@ -1,4 +1,4 @@
-function(collector_file, application_name='sql_exporter') {
+function(collector_name, collector_file, application_name='sql_exporter') {
   // Configuration for sql_exporter for autoscaling-agent
   // Global defaults.
   global: {
@@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') {
     // Collectors (referenced by name) to execute on the target.
     // Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
     collectors: [
-      'neon_collector',
+      collector_name,
     ],
   },
 

From 71d09c78d4ffd159cfcd83c4c1b919a4c7eef7c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sat, 19 Oct 2024 00:23:49 +0200
Subject: [PATCH 25/48] Accept basebackup <tenant> <timeline> --gzip requests
 (#9456)

In #9453, we want to remove the non-gzipped basebackup code in the
computes, and always request gzipped basebackups.

However, right now the pageserver's page service only accepts basebackup
requests in the following formats:

* `basebackup <tenant_id> <timeline_id>`, lsn is determined by the
pageserver as the most recent one (`timeline.get_last_record_rlsn()`)
* `basebackup <tenant_id> <timeline_id> <lsn>`
* `basebackup <tenant_id> <timeline_id> <lsn> --gzip`

We add a fourth case, `basebackup <tenant_id> <timeline_id> --gzip`, to
allow requesting a gzipped basebackup for the latest lsn as well.
---
 pageserver/src/page_service.rs | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index afb2f92ff8..62b14cb83e 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1326,22 +1326,22 @@ where
                 .for_command(ComputeCommandKind::Basebackup)
                 .inc();
 
-            let lsn = if let Some(lsn_str) = params.get(2) {
-                Some(
-                    Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
-                )
-            } else {
-                None
-            };
-
-            let gzip = match params.get(3) {
-                Some(&"--gzip") => true,
-                None => false,
-                Some(third_param) => {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "Parameter in position 3 unknown {third_param}",
-                    )))
+            let (lsn, gzip) = match (params.get(2), params.get(3)) {
+                (None, _) => (None, false),
+                (Some(&"--gzip"), _) => (None, true),
+                (Some(lsn_str), gzip_str_opt) => {
+                    let lsn = Lsn::from_str(lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?;
+                    let gzip = match gzip_str_opt {
+                        Some(&"--gzip") => true,
+                        None => false,
+                        Some(third_param) => {
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "Parameter in position 3 unknown {third_param}",
+                            )))
+                        }
+                    };
+                    (Some(lsn), gzip)
                 }
             };
 

From cc25ef73423ea0108986436501481b0154443932 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Sun, 20 Oct 2024 13:42:50 +0100
Subject: [PATCH 26/48] bump pg-session-jwt version (#9455)

forgot to bump this before
---
 proxy/src/serverless/local_conn_pool.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index beb2ad4e8f..e1ad46c751 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -39,7 +39,7 @@ use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
 use crate::{DbName, RoleName};
 
 pub(crate) const EXT_NAME: &str = "pg_session_jwt";
-pub(crate) const EXT_VERSION: &str = "0.1.1";
+pub(crate) const EXT_VERSION: &str = "0.1.2";
 pub(crate) const EXT_SCHEMA: &str = "auth";
 
 struct ConnPoolEntry<C: ClientInnerExt> {

From ed958da38a0edf7853ee999f43737ac2ff69f920 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 21 Oct 2024 10:29:23 +0200
Subject: [PATCH 27/48] proxy: Make tests fail fast when test proxy exited
 early (#9432)

This currently happens when proxy is not compiled with feature
`testing`.
Also fix the adjacent `wait_for_exit` function, which ignored its
`timeout` argument and always waited 2 seconds.
---
 test_runner/fixtures/neon_fixtures.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3cd8019e32..747c2c0d63 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3175,10 +3175,13 @@ class NeonProxy(PgProtocol):
     # two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time.
     def wait_for_exit(self, timeout=2):
         if self._popen:
-            self._popen.wait(timeout=2)
+            self._popen.wait(timeout=timeout)
 
     @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
     def _wait_until_ready(self):
+        assert (
+            self._popen and self._popen.poll() is None
+        ), "Proxy exited unexpectedly. Check test log."
         requests.get(f"http://{self.host}:{self.http_port}/v1/status")
 
     def http_query(self, query, args, **kwargs):

From 5b37485c99836abb060bed8eb1172870b31504b2 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 21 Oct 2024 09:51:12 +0100
Subject: [PATCH 28/48] Rename dockerfiles from `Dockerfile.<something>` to
 `<something>.Dockerfile` (#9446)

## Problem

Our dockerfiles, for some historical reason, have unconventional names
`Dockerfile.<something>`, and some tools (like GitHub UI) fail to highlight
the syntax in them.

> Some projects may need distinct Dockerfiles for specific purposes. A
common convention is to name these `<something>.Dockerfile`

From: https://docs.docker.com/build/concepts/dockerfile/#filename

## Summary of changes
- Rename `Dockerfile.build-tools` -> `build-tools.Dockerfile`
- Rename `compute/Dockerfile.compute-node` ->
`compute/compute-node.Dockerfile`
---
 .github/workflows/_build-and-test-locally.yml             | 8 ++++----
 .github/workflows/build-build-tools-image.yml             | 2 +-
 .github/workflows/build_and_test.yml                      | 6 +++---
 .github/workflows/check-build-tools-image.yml             | 2 +-
 .github/workflows/trigger-e2e-tests.yml                   | 2 +-
 Dockerfile.build-tools => build-tools.Dockerfile          | 2 +-
 compute/README.md                                         | 6 +++---
 .../{Dockerfile.compute-node => compute-node.Dockerfile}  | 0
 docs/docker.md                                            | 6 +++---
 9 files changed, 17 insertions(+), 17 deletions(-)
 rename Dockerfile.build-tools => build-tools.Dockerfile (99%)
 rename compute/{Dockerfile.compute-node => compute-node.Dockerfile} (100%)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 3aa671fab1..c0f59fbdd5 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -124,28 +124,28 @@ jobs:
         uses: actions/cache@v4
         with:
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
         uses: actions/cache@v4
         with:
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
 
       - name: Cache postgres v16 build
         id: cache_pg_16
         uses: actions/cache@v4
         with:
           path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
 
       - name: Cache postgres v17 build
         id: cache_pg_17
         uses: actions/cache@v4
         with:
           path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
 
       - name: Build postgres v14
         if: steps.cache_pg_14.outputs.cache-hit != 'true'
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
index 0f05276579..10750089b2 100644
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -82,7 +82,7 @@ jobs:
 
       - uses: docker/build-push-action@v6
         with:
-          file: Dockerfile.build-tools
+          file: build-tools.Dockerfile
           context: .
           provenance: false
           push: true
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b669eaeb11..1186b9927b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -683,7 +683,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: compute/Dockerfile.compute-node
+          file: compute/compute-node.Dockerfile
           cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
           tags: |
@@ -703,7 +703,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: compute/Dockerfile.compute-node
+          file: compute/compute-node.Dockerfile
           target: neon-pg-ext-test
           cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
@@ -728,7 +728,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: compute/Dockerfile.compute-node
+          file: compute/compute-node.Dockerfile
           cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
           tags: |
diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml
index 807a9ef3bd..a7a15ad58b 100644
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -31,7 +31,7 @@ jobs:
         id: get-build-tools-tag
         env:
           IMAGE_TAG: |
-            ${{ hashFiles('Dockerfile.build-tools',
+            ${{ hashFiles('build-tools.Dockerfile',
                           '.github/workflows/check-build-tools-image.yml',
                           '.github/workflows/build-build-tools-image.yml') }}
         run: |
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 5c5423e252..1e7264c55a 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -112,7 +112,7 @@ jobs:
                 # This isn't exhaustive, just the paths that are most directly compute-related.
                 # For example, compute_ctl also depends on libs/utils, but we don't trigger
                 # an e2e run on that.
-                vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
+                vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/compute-node.Dockerfile)
                   platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                   ;;
                 *)
diff --git a/Dockerfile.build-tools b/build-tools.Dockerfile
similarity index 99%
rename from Dockerfile.build-tools
rename to build-tools.Dockerfile
index f05c60661c..818cc1b6db 100644
--- a/Dockerfile.build-tools
+++ b/build-tools.Dockerfile
@@ -142,7 +142,7 @@ RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/sourc
 # Use the same version of libicu as the compute nodes so that
 # clusters created using inidb on pageserver can be used by computes.
 #
-# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
+# TODO: at this time, compute-node.Dockerfile uses the debian bullseye libicu
 # package, which is 67.1. We're duplicating that knowledge here, and also, technically,
 # Debian has a few patches on top of 67.1 that we're not adding here.
 ENV ICU_VERSION=67.1
diff --git a/compute/README.md b/compute/README.md
index bb1e42ab53..61e0eee4be 100644
--- a/compute/README.md
+++ b/compute/README.md
@@ -1,7 +1,7 @@
 This directory contains files that are needed to build the compute
 images, or included in the compute images.
 
-Dockerfile.compute-node
+compute-node.Dockerfile
 	To build the compute image
 
 vm-image-spec.yaml
@@ -14,8 +14,8 @@ etc/
 patches/
 	Some extensions need to be patched to work with Neon. This
 	directory contains such patches. They are applied to the extension
-	sources in Dockerfile.compute-node
+	sources in compute-node.Dockerfile
 
 In addition to these, postgres itself, the neon postgres extension,
 and compute_ctl are built and copied into the compute image by
-Dockerfile.compute-node.
+compute-node.Dockerfile.
diff --git a/compute/Dockerfile.compute-node b/compute/compute-node.Dockerfile
similarity index 100%
rename from compute/Dockerfile.compute-node
rename to compute/compute-node.Dockerfile
diff --git a/docs/docker.md b/docs/docker.md
index d16311c27b..0914a00082 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -5,7 +5,7 @@
 Currently we build two main images:
 
 - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
-- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node).
+- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).
 
 And additional intermediate image:
 
@@ -56,7 +56,7 @@ CREATE TABLE
 postgres=# insert into t values(1, 1);
 INSERT 0 1
 postgres=# select * from t;
- key | value 
+ key | value
 -----+-------
    1 | 1
 (1 row)
@@ -84,4 +84,4 @@ Access http://localhost:9001 and sign in.
 - Username: `minio`
 - Password: `password`
 
-You can see durable pages and WAL data in `neon` bucket.
\ No newline at end of file
+You can see durable pages and WAL data in `neon` bucket.

From 163beaf9ad8521ec28d451d1ea884039efcb8897 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 21 Oct 2024 12:14:19 +0100
Subject: [PATCH 29/48] CI: use build-tools on Debian 12 whenever we use Neon
 artifact (#9463)

## Problem

```
+ /tmp/neon/pg_install/v16/bin/psql '***' -c 'SELECT version()'
/tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /tmp/neon/pg_install/v16/bin/psql)
/tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /tmp/neon/pg_install/v16/bin/psql)
/tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /tmp/neon/pg_install/v16/lib/libpq.so.5)
/tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /tmp/neon/pg_install/v16/lib/libpq.so.5)
/tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /tmp/neon/pg_install/v16/lib/libpq.so.5)
```

## Summary of changes
- Use `build-tools:pinned-bookworm` whenever we download a Neon artifact
---
 .../workflows/_benchmarking_preparation.yml    |  2 +-
 .github/workflows/benchmarking.yml             | 18 +++++++++---------
 .github/workflows/cloud-regress.yml            |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
index d60f97320b..5cdc16f248 100644
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -27,7 +27,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 32806b89ab..5ccfe48684 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -83,7 +83,7 @@ jobs:
 
     runs-on: ${{ matrix.RUNNER }}
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -178,7 +178,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -280,7 +280,7 @@ jobs:
         region_id_default=${{ env.DEFAULT_REGION_ID }}
         runner_default='["self-hosted", "us-east-2", "x64"]'
         runner_azure='["self-hosted", "eastus2", "x64"]'
-        image_default="neondatabase/build-tools:pinned"
+        image_default="neondatabase/build-tools:pinned-bookworm"
         matrix='{
           "pg_version" : [
             16
@@ -299,9 +299,9 @@ jobs:
           "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
                       { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
                       { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
                       { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
         }'
 
@@ -665,7 +665,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -772,7 +772,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -877,7 +877,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       credentials:
         username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml
index ecafe183f8..19ebf457b8 100644
--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -31,7 +31,7 @@ jobs:
 
     runs-on: us-east-2
     container:
-      image: neondatabase/build-tools:pinned
+      image: neondatabase/build-tools:pinned-bookworm
       options: --init
 
     steps:

From ababa50cce5e05df4d3d9fcf617a1b2625ed3b4a Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Mon, 21 Oct 2024 16:20:39 +0300
Subject: [PATCH 30/48] Use '-f' for make clean in  Makefile compute (#9464)

Use '-f' instead of '--force', because with '--force' it is impossible
to clean the targets on macOS
---
 compute/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compute/Makefile b/compute/Makefile
index e2896fe390..08e3c7a68b 100644
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -34,7 +34,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)
 
 .PHONY: clean
 clean:
-	rm --force \
+	rm -f \
 		etc/neon_collector.yml \
 		etc/neon_collector_autoscaling.yml \
 		etc/sql_exporter.yml \

From 2dcac94194bedb47e06bc6a98467125e3cfaf07b Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Mon, 21 Oct 2024 17:20:09 +0300
Subject: [PATCH 31/48] proxy: Use common error interface for error handling
 with cplane (#9454)

- Remove obsolete error handles.
- Use one source of truth for cplane errors.
#18468
---
 proxy/src/control_plane/messages.rs     |  6 ++-
 proxy/src/control_plane/provider/mod.rs | 32 +------------
 proxy/src/metrics.rs                    | 16 +------
 proxy/src/proxy/wake_compute.rs         | 62 ++-----------------------
 4 files changed, 12 insertions(+), 104 deletions(-)

diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs
index dae23f7c53..13a54145b1 100644
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -161,6 +161,9 @@ pub(crate) enum Reason {
     /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken.
     #[serde(rename = "LOCK_ALREADY_TAKEN")]
     LockAlreadyTaken,
+    /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
+    #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
+    ActiveEndpointsLimitExceeded,
     #[default]
     #[serde(other)]
     Unknown,
@@ -194,7 +197,8 @@ impl Reason {
             | Reason::ComputeTimeQuotaExceeded
             | Reason::WrittenDataQuotaExceeded
             | Reason::DataTransferQuotaExceeded
-            | Reason::LogicalSizeQuotaExceeded => false,
+            | Reason::LogicalSizeQuotaExceeded
+            | Reason::ActiveEndpointsLimitExceeded => false,
             // transitive error. control plane is currently busy
             // but might be ready soon
             Reason::RunningOperations
diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs
index a4a330cd5f..88399dffa8 100644
--- a/proxy/src/control_plane/provider/mod.rs
+++ b/proxy/src/control_plane/provider/mod.rs
@@ -87,36 +87,8 @@ pub(crate) mod errors {
                     Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
                     Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
                     Reason::RunningOperations => ErrorKind::ControlPlane,
-                    Reason::Unknown => match &**e {
-                        ControlPlaneError {
-                            http_status_code:
-                                http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
-                            ..
-                        } => crate::error::ErrorKind::User,
-                        ControlPlaneError {
-                            http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
-                            error,
-                            ..
-                        } if error
-                            .contains("compute time quota of non-primary branches is exceeded") =>
-                        {
-                            crate::error::ErrorKind::Quota
-                        }
-                        ControlPlaneError {
-                            http_status_code: http::StatusCode::LOCKED,
-                            error,
-                            ..
-                        } if error.contains("quota exceeded")
-                            || error.contains("the limit for current plan reached") =>
-                        {
-                            crate::error::ErrorKind::Quota
-                        }
-                        ControlPlaneError {
-                            http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
-                            ..
-                        } => crate::error::ErrorKind::ServiceRateLimit,
-                        ControlPlaneError { .. } => crate::error::ErrorKind::ControlPlane,
-                    },
+                    Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
+                    Reason::Unknown => ErrorKind::ControlPlane,
                 },
                 ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
             }
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 542826e833..f91fcd4120 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -14,6 +14,7 @@ use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
 use tokio::time::{self, Instant};
 
 use crate::control_plane::messages::ColdStartInfo;
+use crate::error::ErrorKind;
 
 #[derive(MetricGroup)]
 #[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
@@ -325,23 +326,10 @@ pub enum ConnectionFailureKind {
     ComputeUncached,
 }
 
-#[derive(FixedCardinalityLabel, Copy, Clone)]
-#[label(singleton = "kind")]
-pub enum WakeupFailureKind {
-    BadComputeAddress,
-    ApiTransportError,
-    QuotaExceeded,
-    ApiConsoleLocked,
-    ApiConsoleBadRequest,
-    ApiConsoleOtherServerError,
-    ApiConsoleOtherError,
-    TimeoutError,
-}
-
 #[derive(LabelGroup)]
 #[label(set = ConnectionFailuresBreakdownSet)]
 pub struct ConnectionFailuresBreakdownGroup {
-    pub kind: WakeupFailureKind,
+    pub kind: ErrorKind,
     pub retry: Bool,
 }
 
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index 9dfa485fa4..4e61094264 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -1,15 +1,13 @@
-use hyper::StatusCode;
 use tracing::{error, info, warn};
 
 use super::connect_compute::ComputeConnectBackend;
 use crate::config::RetryConfig;
 use crate::context::RequestMonitoring;
 use crate::control_plane::errors::WakeComputeError;
-use crate::control_plane::messages::{ControlPlaneError, Reason};
 use crate::control_plane::provider::CachedNodeInfo;
+use crate::error::ReportableError;
 use crate::metrics::{
     ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
-    WakeupFailureKind,
 };
 use crate::proxy::retry::{retry_after, should_retry};
 
@@ -60,62 +58,8 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
 }
 
 fn report_error(e: &WakeComputeError, retry: bool) {
-    use crate::control_plane::errors::ApiError;
-    let kind = match e {
-        WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress,
-        WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError,
-        WakeComputeError::ApiError(ApiError::ControlPlane(e)) => match e.get_reason() {
-            Reason::RoleProtected => WakeupFailureKind::ApiConsoleBadRequest,
-            Reason::ResourceNotFound => WakeupFailureKind::ApiConsoleBadRequest,
-            Reason::ProjectNotFound => WakeupFailureKind::ApiConsoleBadRequest,
-            Reason::EndpointNotFound => WakeupFailureKind::ApiConsoleBadRequest,
-            Reason::BranchNotFound => WakeupFailureKind::ApiConsoleBadRequest,
-            Reason::RateLimitExceeded => WakeupFailureKind::ApiConsoleLocked,
-            Reason::NonDefaultBranchComputeTimeExceeded => WakeupFailureKind::QuotaExceeded,
-            Reason::ActiveTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded,
-            Reason::ComputeTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded,
-            Reason::WrittenDataQuotaExceeded => WakeupFailureKind::QuotaExceeded,
-            Reason::DataTransferQuotaExceeded => WakeupFailureKind::QuotaExceeded,
-            Reason::LogicalSizeQuotaExceeded => WakeupFailureKind::QuotaExceeded,
-            Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked,
-            Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked,
-            Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked,
-            Reason::Unknown => match **e {
-                ControlPlaneError {
-                    http_status_code: StatusCode::LOCKED,
-                    ref error,
-                    ..
-                } if error.contains("written data quota exceeded")
-                    || error.contains("the limit for current plan reached") =>
-                {
-                    WakeupFailureKind::QuotaExceeded
-                }
-                ControlPlaneError {
-                    http_status_code: StatusCode::UNPROCESSABLE_ENTITY,
-                    ref error,
-                    ..
-                } if error.contains("compute time quota of non-primary branches is exceeded") => {
-                    WakeupFailureKind::QuotaExceeded
-                }
-                ControlPlaneError {
-                    http_status_code: StatusCode::LOCKED,
-                    ..
-                } => WakeupFailureKind::ApiConsoleLocked,
-                ControlPlaneError {
-                    http_status_code: StatusCode::BAD_REQUEST,
-                    ..
-                } => WakeupFailureKind::ApiConsoleBadRequest,
-                ControlPlaneError {
-                    http_status_code, ..
-                } if http_status_code.is_server_error() => {
-                    WakeupFailureKind::ApiConsoleOtherServerError
-                }
-                ControlPlaneError { .. } => WakeupFailureKind::ApiConsoleOtherError,
-            },
-        },
-        WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
-        WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
-    };
+    let kind = e.get_error_kind();
+
     Metrics::get()
         .proxy
         .connection_failures_breakdown

From aca81f5fa4f3e0f882a9b0d55eef1cdee8ffc168 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 21 Oct 2024 10:59:48 -0400
Subject: [PATCH 32/48] fix(pageserver): make image split layer writer finish
 atomic (#8841)

Part of https://github.com/neondatabase/neon/issues/8836

## Summary of changes

This pull request makes the image layer split writer atomic when
finishing the layers. All the produced layers are either finished
together or discarded together. Note that this does not guarantee
atomicity across a crash, but anything left behind will be cleaned up
on pageserver restart.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 .../src/tenant/storage_layer/delta_layer.rs   |   4 +-
 .../src/tenant/storage_layer/image_layer.rs   |  19 ++
 .../src/tenant/storage_layer/split_writer.rs  | 231 ++++++++++--------
 pageserver/src/tenant/timeline/compaction.rs  |   7 +-
 4 files changed, 149 insertions(+), 112 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index d1079876f8..6332d36dc3 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -515,8 +515,8 @@ impl DeltaLayerWriterInner {
     ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
         let temp_path = self.path.clone();
         let result = self.finish0(key_end, ctx).await;
-        if result.is_err() {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
+        if let Err(ref e) = result {
+            tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
             if let Err(e) = std::fs::remove_file(&temp_path) {
                 tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
             }
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 6c1a943470..b1f2557038 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -827,6 +827,25 @@ impl ImageLayerWriterInner {
         self,
         ctx: &RequestContext,
         end_key: Option<Key>,
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        let temp_path = self.path.clone();
+        let result = self.finish0(ctx, end_key).await;
+        if let Err(ref e) = result {
+            tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
+            }
+        }
+        result
+    }
+
+    ///
+    /// Finish writing the image layer.
+    ///
+    async fn finish0(
+        self,
+        ctx: &RequestContext,
+        end_key: Option<Key>,
     ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
         let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
 
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index b499a0eef4..5bd9a47e2b 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -42,7 +42,7 @@ impl SplitWriterResult {
 pub struct SplitImageLayerWriter {
     inner: ImageLayerWriter,
     target_layer_size: u64,
-    generated_layers: Vec<SplitWriterResult>,
+    generated_layer_writers: Vec<(ImageLayerWriter, PersistentLayerKey)>,
     conf: &'static PageServerConf,
     timeline_id: TimelineId,
     tenant_shard_id: TenantShardId,
@@ -71,7 +71,7 @@ impl SplitImageLayerWriter {
                 ctx,
             )
             .await?,
-            generated_layers: Vec::new(),
+            generated_layer_writers: Vec::new(),
             conf,
             timeline_id,
             tenant_shard_id,
@@ -80,18 +80,12 @@ impl SplitImageLayerWriter {
         })
     }
 
-    pub async fn put_image_with_discard_fn<D, F>(
+    pub async fn put_image(
         &mut self,
         key: Key,
         img: Bytes,
-        tline: &Arc<Timeline>,
         ctx: &RequestContext,
-        discard: D,
-    ) -> anyhow::Result<()>
-    where
-        D: FnOnce(&PersistentLayerKey) -> F,
-        F: Future<Output = bool>,
-    {
+    ) -> anyhow::Result<()> {
         // The current estimation is an upper bound of the space that the key/image could take
         // because we did not consider compression in this estimation. The resulting image layer
         // could be smaller than the target size.
@@ -108,72 +102,83 @@ impl SplitImageLayerWriter {
                 ctx,
             )
             .await?;
-            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
             let layer_key = PersistentLayerKey {
                 key_range: self.start_key..key,
                 lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
                 is_delta: false,
             };
+            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
             self.start_key = key;
 
-            if discard(&layer_key).await {
-                drop(prev_image_writer);
-                self.generated_layers
-                    .push(SplitWriterResult::Discarded(layer_key));
-            } else {
-                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
-
-                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-                self.generated_layers
-                    .push(SplitWriterResult::Produced(layer));
-            }
+            self.generated_layer_writers
+                .push((prev_image_writer, layer_key));
         }
         self.inner.put_image(key, img, ctx).await
     }
 
-    #[cfg(test)]
-    pub async fn put_image(
-        &mut self,
-        key: Key,
-        img: Bytes,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
-            .await
-    }
-
     pub(crate) async fn finish_with_discard_fn<D, F>(
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
         end_key: Key,
-        discard: D,
+        discard_fn: D,
     ) -> anyhow::Result<Vec<SplitWriterResult>>
     where
-        D: FnOnce(&PersistentLayerKey) -> F,
+        D: Fn(&PersistentLayerKey) -> F,
         F: Future<Output = bool>,
     {
         let Self {
-            mut generated_layers,
+            mut generated_layer_writers,
             inner,
             ..
         } = self;
-        if inner.num_keys() == 0 {
-            return Ok(generated_layers);
+        if inner.num_keys() != 0 {
+            let layer_key = PersistentLayerKey {
+                key_range: self.start_key..end_key,
+                lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
+                is_delta: false,
+            };
+            generated_layer_writers.push((inner, layer_key));
         }
-        let layer_key = PersistentLayerKey {
-            key_range: self.start_key..end_key,
-            lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
-            is_delta: false,
+        let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
+            for produced_layer in generated_layers {
+                if let SplitWriterResult::Produced(image_layer) = produced_layer {
+                    let layer: Layer = image_layer.into();
+                    layer.delete_on_drop();
+                }
+            }
         };
-        if discard(&layer_key).await {
-            generated_layers.push(SplitWriterResult::Discarded(layer_key));
-        } else {
-            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
-            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-            generated_layers.push(SplitWriterResult::Produced(layer));
+        // BEGIN: catch every error and do the recovery in the below section
+        let mut generated_layers = Vec::new();
+        for (inner, layer_key) in generated_layer_writers {
+            if discard_fn(&layer_key).await {
+                generated_layers.push(SplitWriterResult::Discarded(layer_key));
+            } else {
+                let layer = match inner
+                    .finish_with_end_key(layer_key.key_range.end, ctx)
+                    .await
+                {
+                    Ok((desc, path)) => {
+                        match Layer::finish_creating(self.conf, tline, desc, &path) {
+                            Ok(layer) => layer,
+                            Err(e) => {
+                                tokio::fs::remove_file(&path).await.ok();
+                                clean_up_layers(generated_layers);
+                                return Err(e);
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        // ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong,
+                        // so we don't need to remove it by ourselves.
+                        clean_up_layers(generated_layers);
+                        return Err(e);
+                    }
+                };
+                generated_layers.push(SplitWriterResult::Produced(layer));
+            }
         }
+        // END: catch every error and do the recovery in the above section
         Ok(generated_layers)
     }
 
@@ -187,11 +192,6 @@ impl SplitImageLayerWriter {
         self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
             .await
     }
-
-    /// This function will be deprecated with #8841.
-    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
-    }
 }
 
 /// A delta writer that takes key-lsn-values and produces multiple delta layers.
@@ -296,8 +296,16 @@ impl SplitDeltaLayerWriter {
                     self.generated_layers
                         .push(SplitWriterResult::Discarded(layer_key));
                 } else {
+                    // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
+                    // files for `finish_creating`.
                     let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
-                    let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                    let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
+                        Ok(layer) => layer,
+                        Err(e) => {
+                            tokio::fs::remove_file(&path).await.ok();
+                            return Err(e);
+                        }
+                    };
                     self.generated_layers
                         .push(SplitWriterResult::Produced(delta_layer));
                 }
@@ -357,8 +365,16 @@ impl SplitDeltaLayerWriter {
         if discard(&layer_key).await {
             generated_layers.push(SplitWriterResult::Discarded(layer_key));
         } else {
+            // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
+            // files for `finish_creating`.
             let (desc, path) = inner.finish(end_key, ctx).await?;
-            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
+                Ok(layer) => layer,
+                Err(e) => {
+                    tokio::fs::remove_file(&path).await.ok();
+                    return Err(e);
+                }
+            };
             generated_layers.push(SplitWriterResult::Produced(delta_layer));
         }
         Ok(generated_layers)
@@ -447,7 +463,7 @@ mod tests {
         .unwrap();
 
         image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
+            .put_image(get_key(0), get_img(0), &ctx)
             .await
             .unwrap();
         let layers = image_writer
@@ -486,14 +502,18 @@ mod tests {
 
     #[tokio::test]
     async fn write_split() {
+        // Test the split writer with retaining all the layers we have produced (discard=false)
         write_split_helper("split_writer_write_split", false).await;
     }
 
     #[tokio::test]
     async fn write_split_discard() {
-        write_split_helper("split_writer_write_split_discard", false).await;
+        // Test the split writer with discarding all the layers we have produced (discard=true)
+        write_split_helper("split_writer_write_split_discard", true).await;
     }
 
+    /// Test the image+delta writer by writing a large number of images and deltas. If discard is
+    /// set to true, all layers will be discarded.
     async fn write_split_helper(harness_name: &'static str, discard: bool) {
         let harness = TenantHarness::create(harness_name).await.unwrap();
         let (tenant, ctx) = harness.load().await;
@@ -527,9 +547,7 @@ mod tests {
         for i in 0..N {
             let i = i as u32;
             image_writer
-                .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
-                    discard
-                })
+                .put_image(get_key(i), get_large_img(), &ctx)
                 .await
                 .unwrap();
             delta_writer
@@ -545,51 +563,54 @@ mod tests {
                 .unwrap();
         }
         let image_layers = image_writer
-            .finish(&tline, &ctx, get_key(N as u32))
+            .finish_with_discard_fn(&tline, &ctx, get_key(N as u32), |_| async { discard })
             .await
             .unwrap();
-        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
-        if discard {
-            for layer in image_layers {
-                layer.into_discarded_layer();
-            }
-            for layer in delta_layers {
-                layer.into_discarded_layer();
-            }
-        } else {
-            let image_layers = image_layers
-                .into_iter()
-                .map(|x| x.into_resident_layer())
-                .collect_vec();
-            let delta_layers = delta_layers
-                .into_iter()
-                .map(|x| x.into_resident_layer())
-                .collect_vec();
-            assert_eq!(image_layers.len(), N / 512 + 1);
-            assert_eq!(delta_layers.len(), N / 512 + 1);
-            assert_eq!(
-                delta_layers.first().unwrap().layer_desc().key_range.start,
-                get_key(0)
-            );
-            assert_eq!(
-                delta_layers.last().unwrap().layer_desc().key_range.end,
-                get_key(N as u32)
-            );
-            for idx in 0..image_layers.len() {
-                assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
-                assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
-                assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
-                assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
-                if idx > 0 {
-                    assert_eq!(
-                        image_layers[idx - 1].layer_desc().key_range.end,
-                        image_layers[idx].layer_desc().key_range.start
-                    );
-                    assert_eq!(
-                        delta_layers[idx - 1].layer_desc().key_range.end,
-                        delta_layers[idx].layer_desc().key_range.start
-                    );
+        let delta_layers = delta_writer
+            .finish_with_discard_fn(&tline, &ctx, |_| async { discard })
+            .await
+            .unwrap();
+        let image_layers = image_layers
+            .into_iter()
+            .map(|x| {
+                if discard {
+                    x.into_discarded_layer()
+                } else {
+                    x.into_resident_layer().layer_desc().key()
                 }
+            })
+            .collect_vec();
+        let delta_layers = delta_layers
+            .into_iter()
+            .map(|x| {
+                if discard {
+                    x.into_discarded_layer()
+                } else {
+                    x.into_resident_layer().layer_desc().key()
+                }
+            })
+            .collect_vec();
+        assert_eq!(image_layers.len(), N / 512 + 1);
+        assert_eq!(delta_layers.len(), N / 512 + 1);
+        assert_eq!(delta_layers.first().unwrap().key_range.start, get_key(0));
+        assert_eq!(
+            delta_layers.last().unwrap().key_range.end,
+            get_key(N as u32)
+        );
+        for idx in 0..image_layers.len() {
+            assert_ne!(image_layers[idx].key_range.start, Key::MIN);
+            assert_ne!(image_layers[idx].key_range.end, Key::MAX);
+            assert_ne!(delta_layers[idx].key_range.start, Key::MIN);
+            assert_ne!(delta_layers[idx].key_range.end, Key::MAX);
+            if idx > 0 {
+                assert_eq!(
+                    image_layers[idx - 1].key_range.end,
+                    image_layers[idx].key_range.start
+                );
+                assert_eq!(
+                    delta_layers[idx - 1].key_range.end,
+                    delta_layers[idx].key_range.start
+                );
             }
         }
     }
@@ -629,11 +650,11 @@ mod tests {
         .unwrap();
 
         image_writer
-            .put_image(get_key(0), get_img(0), &tline, &ctx)
+            .put_image(get_key(0), get_img(0), &ctx)
             .await
             .unwrap();
         image_writer
-            .put_image(get_key(1), get_large_img(), &tline, &ctx)
+            .put_image(get_key(1), get_large_img(), &ctx)
             .await
             .unwrap();
         let layers = image_writer
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 5588363330..5cb1460b29 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -141,9 +141,7 @@ impl KeyHistoryRetention {
                     };
                     stat.produce_image_key(img);
                     if let Some(image_writer) = image_writer.as_mut() {
-                        image_writer
-                            .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
-                            .await?;
+                        image_writer.put_image(key, img.clone(), ctx).await?;
                     } else {
                         delta_writer
                             .put_value_with_discard_fn(
@@ -2041,8 +2039,7 @@ impl Timeline {
                     .finish_with_discard_fn(self, ctx, Key::MAX, discard)
                     .await?
             } else {
-                let (layers, _) = writer.take()?;
-                assert!(layers.is_empty(), "image layers produced in dry run mode?");
+                drop(writer);
                 Vec::new()
             }
         } else {

From 49d5e56c084f1fd694cde75d56c2d8ed9049c06e Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Mon, 21 Oct 2024 11:01:25 -0400
Subject: [PATCH 33/48] pageserver: use direct IO for delta and image layer
 reads (#9326)

Part of #8130

## Problem

Pageserver previously went through the kernel page cache for all
IOs. The kernel page cache makes a lightly loaded pageserver appear
deceptively fast. Using direct IO offers predictable latencies for
our virtual file IO operations.

In particular for reads, the data pages also have an extremely low
temporal locality because the most frequently accessed pages are cached
on the compute side.

## Summary of changes

This PR enables pageserver to use direct IO for delta layer and image
layer reads. We can ship them separately because these layers are
write-once, read-many, so we will not be mixing buffered IO with direct
IO.

- implement `IoBufferMut`, a buffer type with aligned allocation
(currently set to 512).
- use `IoBufferMut` at all places we are doing reads on image + delta
layers.
- leverage Rust type system and use `IoBufAlignedMut` marker trait to
guarantee that the input buffers for the IO operations are aligned.
- page cache allocation is also made aligned.

_* in-memory layer reads and the write path will be shipped separately._

## Testing

Integration test suite run with O_DIRECT enabled:
https://github.com/neondatabase/neon/pull/9350

## Performance

We evaluated performance based on the `get-page-at-latest-lsn`
benchmark. The results demonstrate a decrease in the number of IOps, no
significant change in the latency mean, and a slight improvement in the
p99.9 and p99.99 latencies.


[Benchmark](https://www.notion.so/neondatabase/Benchmark-O_DIRECT-for-image-and-delta-layers-2024-10-01-112f189e00478092a195ea5a0137e706?pvs=4)

## Rollout

We will add `virtual_file_io_mode=direct` region by region to enable
direct IO on image + delta layers.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 pageserver/benches/bench_ingest.rs            |   6 +-
 pageserver/ctl/src/layer_map_analyzer.rs      |   7 +-
 pageserver/ctl/src/layers.rs                  |  13 +-
 pageserver/ctl/src/main.rs                    |   8 +-
 pageserver/src/bin/pageserver.rs              |   6 +-
 pageserver/src/page_cache.rs                  |  16 +-
 pageserver/src/tenant/block_io.rs             |   6 +-
 pageserver/src/tenant/ephemeral_file.rs       |  28 +-
 .../src/tenant/storage_layer/delta_layer.rs   |  15 +-
 .../src/tenant/storage_layer/image_layer.rs   |  19 +-
 .../tenant/storage_layer/inmemory_layer.rs    |   8 +-
 .../inmemory_layer/vectored_dio_read.rs       |  23 +-
 pageserver/src/tenant/vectored_blob_io.rs     |   9 +-
 pageserver/src/virtual_file.rs                |  40 +-
 .../owned_buffers_io/aligned_buffer.rs        |   9 +
 .../aligned_buffer/alignment.rs               |  26 ++
 .../owned_buffers_io/aligned_buffer/buffer.rs | 124 +++++++
 .../aligned_buffer/buffer_mut.rs              | 347 ++++++++++++++++++
 .../owned_buffers_io/aligned_buffer/raw.rs    | 216 +++++++++++
 .../owned_buffers_io/aligned_buffer/slice.rs  |  40 ++
 .../owned_buffers_io/io_buf_aligned.rs        |   9 +
 .../owned_buffers_io/io_buf_ext.rs            |   3 +
 22 files changed, 899 insertions(+), 79 deletions(-)
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs

diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 821c8008a9..d98b23acce 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) {
     let conf: &'static PageServerConf = Box::leak(Box::new(
         pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
     ));
-    virtual_file::init(16384, virtual_file::io_engine_for_bench());
+    virtual_file::init(
+        16384,
+        virtual_file::io_engine_for_bench(),
+        conf.virtual_file_io_mode,
+    );
     page_cache::init(conf.page_cache_size);
 
     {
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index 151b94cf62..7dd2a5d05c 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -7,6 +7,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
+use pageserver::virtual_file::api::IoMode;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
@@ -152,7 +153,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
 
     // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    pageserver::virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        IoMode::preferred(),
+    );
     pageserver::page_cache::init(100);
 
     let mut total_delta_layers = 0usize;
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index fd948bf2ef..c0b2b6ae89 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -11,6 +11,7 @@ use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
 use pageserver::tenant::storage_layer::{delta_layer, image_layer};
 use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
+use pageserver::virtual_file::api::IoMode;
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
     repository::{Key, KEY_SIZE},
@@ -59,7 +60,11 @@ pub(crate) enum LayerCmd {
 
 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        IoMode::preferred(),
+    );
     page_cache::init(100);
     let file = VirtualFile::open(path, ctx).await?;
     let file_id = page_cache::next_file_id();
@@ -190,7 +195,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             new_tenant_id,
             new_timeline_id,
         } => {
-            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+            pageserver::virtual_file::init(
+                10,
+                virtual_file::api::IoEngineKind::StdFs,
+                IoMode::preferred(),
+            );
             pageserver::page_cache::init(100);
 
             let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index c96664d346..f506caec5b 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -24,7 +24,7 @@ use pageserver::{
     page_cache,
     task_mgr::TaskKind,
     tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
-    virtual_file,
+    virtual_file::{self, api::IoMode},
 };
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
@@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
 
 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
     // Basic initialization of things that don't change after startup
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        IoMode::preferred(),
+    );
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
     dump_layerfile_from_path(path, true, &ctx).await
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index f71a3d2653..c6659345f9 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -167,7 +167,11 @@ fn main() -> anyhow::Result<()> {
     let scenario = failpoint_support::init();
 
     // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
+    virtual_file::init(
+        conf.max_file_descriptors,
+        conf.virtual_file_io_engine,
+        conf.virtual_file_io_mode,
+    );
     page_cache::init(conf.page_cache_size);
 
     start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index f386c825b8..45bf02362a 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -82,6 +82,7 @@ use once_cell::sync::OnceCell;
 use crate::{
     context::RequestContext,
     metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    virtual_file::{IoBufferMut, IoPageSlice},
 };
 
 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -144,7 +145,7 @@ struct SlotInner {
     key: Option<CacheKey>,
     // for `coalesce_readers_permit`
     permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
-    buf: &'static mut [u8; PAGE_SZ],
+    buf: IoPageSlice<'static>,
 }
 
 impl Slot {
@@ -234,13 +235,13 @@ impl std::ops::Deref for PageReadGuard<'_> {
     type Target = [u8; PAGE_SZ];
 
     fn deref(&self) -> &Self::Target {
-        self.slot_guard.buf
+        self.slot_guard.buf.deref()
     }
 }
 
 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
     fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.slot_guard.buf
+        self.slot_guard.buf.as_ref()
     }
 }
 
@@ -266,7 +267,7 @@ enum PageWriteGuardState<'i> {
 impl std::ops::DerefMut for PageWriteGuard<'_> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(),
             PageWriteGuardState::Downgraded => unreachable!(),
         }
     }
@@ -277,7 +278,7 @@ impl std::ops::Deref for PageWriteGuard<'_> {
 
     fn deref(&self) -> &Self::Target {
         match &self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(),
             PageWriteGuardState::Downgraded => unreachable!(),
         }
     }
@@ -643,7 +644,7 @@ impl PageCache {
         // We could use Vec::leak here, but that potentially also leaks
         // uninitialized reserved capacity. With into_boxed_slice and Box::leak
         // this is avoided.
-        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
+        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
 
         let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
         size_metrics.max_bytes.set_page_sz(num_pages);
@@ -652,7 +653,8 @@ impl PageCache {
         let slots = page_buffer
             .chunks_exact_mut(PAGE_SZ)
             .map(|chunk| {
-                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
+                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
+                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
 
                 Slot {
                     inner: tokio::sync::RwLock::new(SlotInner {
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 1c82e5454d..2bd7f2d619 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,6 +5,8 @@
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
 use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
+#[cfg(test)]
+use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
 use std::ops::Deref;
@@ -40,7 +42,7 @@ pub enum BlockLease<'a> {
     #[cfg(test)]
     Arc(std::sync::Arc<[u8; PAGE_SZ]>),
     #[cfg(test)]
-    Vec(Vec<u8>),
+    IoBufferMut(IoBufferMut),
 }
 
 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -67,7 +69,7 @@ impl Deref for BlockLease<'_> {
             #[cfg(test)]
             BlockLease::Arc(v) => v.deref(),
             #[cfg(test)]
-            BlockLease::Vec(v) => {
+            BlockLease::IoBufferMut(v) => {
                 TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
             }
         }
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index a62a47f9a7..de0abab4c0 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -6,10 +6,11 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache;
 use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
+use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
 use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
 use crate::virtual_file::owned_buffers_io::write::Buffer;
-use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
+use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
 use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use num_traits::Num;
@@ -107,15 +108,18 @@ impl EphemeralFile {
         self.page_cache_file_id
     }
 
-    pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
+    pub(crate) async fn load_to_io_buf(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<IoBufferMut, io::Error> {
         let size = self.len().into_usize();
-        let vec = Vec::with_capacity(size);
-        let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
+        let buf = IoBufferMut::with_capacity(size);
+        let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?;
         assert_eq!(nread, size);
-        let vec = slice.into_inner();
-        assert_eq!(vec.len(), nread);
-        assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
-        Ok(vec)
+        let buf = slice.into_inner();
+        assert_eq!(buf.len(), nread);
+        assert_eq!(buf.capacity(), size, "we shouldn't be reallocating");
+        Ok(buf)
     }
 
     /// Returns the offset at which the first byte of the input was written, for use
@@ -158,7 +162,7 @@ impl EphemeralFile {
 }
 
 impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
-    async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
+    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
         &'b self,
         start: u64,
         dst: tokio_epoll_uring::Slice<B>,
@@ -345,7 +349,7 @@ mod tests {
         assert!(file.len() as usize == write_nbytes);
         for i in 0..write_nbytes {
             assert_eq!(value_offsets[i], i.into_u64());
-            let buf = Vec::with_capacity(1);
+            let buf = IoBufferMut::with_capacity(1);
             let (buf_slice, nread) = file
                 .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
                 .await
@@ -385,7 +389,7 @@ mod tests {
 
         // assert the state is as this test expects it to be
         assert_eq!(
-            &file.load_to_vec(&ctx).await.unwrap(),
+            &file.load_to_io_buf(&ctx).await.unwrap(),
             &content[0..cap + cap / 2]
         );
         let md = file
@@ -440,7 +444,7 @@ mod tests {
                 let (buf, nread) = file
                     .read_exact_at_eof_ok(
                         start.into_u64(),
-                        Vec::with_capacity(len).slice_full(),
+                        IoBufferMut::with_capacity(len).slice_full(),
                         ctx,
                     )
                     .await
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6332d36dc3..ceae1d4b1a 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -44,11 +44,11 @@ use crate::tenant::vectored_blob_io::{
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
+use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -1002,7 +1002,7 @@ impl DeltaLayerInner {
             .0
             .into();
         let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let mut buf = Some(BytesMut::with_capacity(buf_size));
+        let mut buf = Some(IoBufferMut::with_capacity(buf_size));
 
         // Note that reads are processed in reverse order (from highest key+lsn).
         // This is the order that `ReconstructState` requires such that it can
@@ -1029,7 +1029,7 @@ impl DeltaLayerInner {
 
                     // We have "lost" the buffer since the lower level IO api
                     // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(buf_size));
+                    buf = Some(IoBufferMut::with_capacity(buf_size));
 
                     continue;
                 }
@@ -1203,7 +1203,7 @@ impl DeltaLayerInner {
             .map(|x| x.0.get())
             .unwrap_or(8192);
 
-        let mut buffer = Some(BytesMut::with_capacity(max_read_size));
+        let mut buffer = Some(IoBufferMut::with_capacity(max_read_size));
 
         // FIXME: buffering of DeltaLayerWriter
         let mut per_blob_copy = Vec::new();
@@ -1561,12 +1561,11 @@ impl<'a> DeltaLayerIterator<'a> {
         let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
         let mut next_batch = std::collections::VecDeque::new();
         let buf_size = plan.size();
-        let buf = BytesMut::with_capacity(buf_size);
+        let buf = IoBufferMut::with_capacity(buf_size);
         let blobs_buf = vectored_blob_reader
             .read_blobs(&plan, buf, self.ctx)
             .await?;
-        let frozen_buf = blobs_buf.buf.freeze();
-        let view = BufView::new_bytes(frozen_buf);
+        let view = BufView::new_slice(&blobs_buf.buf);
         for meta in blobs_buf.blobs.iter() {
             let blob_read = meta.read(&view).await?;
             let value = Value::des(&blob_read)?;
@@ -1941,7 +1940,7 @@ pub(crate) mod test {
                 &vectored_reads,
                 constants::MAX_VECTORED_READ_BYTES,
             );
-            let mut buf = Some(BytesMut::with_capacity(buf_size));
+            let mut buf = Some(IoBufferMut::with_capacity(buf_size));
 
             for read in vectored_reads {
                 let blobs_buf = vectored_blob_reader
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index b1f2557038..fa058833d4 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -41,10 +41,11 @@ use crate::tenant::vectored_blob_io::{
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
+use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::{Bytes, BytesMut};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
@@ -547,10 +548,10 @@ impl ImageLayerInner {
         for read in plan.into_iter() {
             let buf_size = read.size();
 
-            let buf = BytesMut::with_capacity(buf_size);
+            let buf = IoBufferMut::with_capacity(buf_size);
             let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-            let frozen_buf = blobs_buf.buf.freeze();
-            let view = BufView::new_bytes(frozen_buf);
+
+            let view = BufView::new_slice(&blobs_buf.buf);
 
             for meta in blobs_buf.blobs.iter() {
                 let img_buf = meta.read(&view).await?;
@@ -609,13 +610,12 @@ impl ImageLayerInner {
                 }
             }
 
-            let buf = BytesMut::with_capacity(buf_size);
+            let buf = IoBufferMut::with_capacity(buf_size);
             let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
 
             match res {
                 Ok(blobs_buf) => {
-                    let frozen_buf = blobs_buf.buf.freeze();
-                    let view = BufView::new_bytes(frozen_buf);
+                    let view = BufView::new_slice(&blobs_buf.buf);
                     for meta in blobs_buf.blobs.iter() {
                         let img_buf = meta.read(&view).await;
 
@@ -1069,12 +1069,11 @@ impl<'a> ImageLayerIterator<'a> {
         let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
         let mut next_batch = std::collections::VecDeque::new();
         let buf_size = plan.size();
-        let buf = BytesMut::with_capacity(buf_size);
+        let buf = IoBufferMut::with_capacity(buf_size);
         let blobs_buf = vectored_blob_reader
             .read_blobs(&plan, buf, self.ctx)
             .await?;
-        let frozen_buf = blobs_buf.buf.freeze();
-        let view = BufView::new_bytes(frozen_buf);
+        let view = BufView::new_slice(&blobs_buf.buf);
         for meta in blobs_buf.blobs.iter() {
             let img_buf = meta.read(&view).await?;
             next_batch.push_back((
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index e487bee1f2..7573ddb5cc 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -14,7 +14,6 @@ use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
 use anyhow::{anyhow, Context, Result};
-use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
 use pageserver_api::keyspace::KeySpace;
@@ -809,9 +808,8 @@ impl InMemoryLayer {
 
         match l0_flush_global_state {
             l0_flush::Inner::Direct { .. } => {
-                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
-
-                let file_contents = Bytes::from(file_contents);
+                let file_contents = inner.file.load_to_io_buf(ctx).await?;
+                let file_contents = file_contents.freeze();
 
                 for (key, vec_map) in inner.index.iter() {
                     // Write all page versions
@@ -825,7 +823,7 @@ impl InMemoryLayer {
                             len,
                             will_init,
                         } = entry;
-                        let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
+                        let buf = file_contents.slice(pos as usize..(pos + len) as usize);
                         let (_buf, res) = delta_layer_writer
                             .put_value_bytes(
                                 Key::from_compact(*key),
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
index 0683e15659..a4bb3a6bfc 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -9,6 +9,7 @@ use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
 use crate::{
     assert_u64_eq_usize::{U64IsUsize, UsizeIsU64},
     context::RequestContext,
+    virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut},
 };
 
 /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
@@ -24,7 +25,7 @@ pub trait File: Send {
     /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`.
     ///
     /// No guarantees are made about the remaining bytes in `dst` in case of a short read.
-    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
+    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
         &'b self,
         start: u64,
         dst: Slice<B>,
@@ -227,7 +228,7 @@ where
 
     // Execute physical reads and fill the logical read buffers
     // TODO: pipelined reads; prefetch;
-    let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE);
+    let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE);
     for PhysicalRead {
         start_chunk_no,
         nchunks,
@@ -459,7 +460,7 @@ mod tests {
         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let file = InMemoryFile::new_random(10);
         let test_read = |pos, len| {
-            let buf = vec![0; len];
+            let buf = IoBufferMut::with_capacity_zeroed(len);
             let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx);
             use futures::FutureExt;
             let (slice, nread) = fut
@@ -470,9 +471,9 @@ mod tests {
             buf.truncate(nread);
             buf
         };
-        assert_eq!(test_read(0, 1), &file.content[0..1]);
-        assert_eq!(test_read(1, 2), &file.content[1..3]);
-        assert_eq!(test_read(9, 2), &file.content[9..]);
+        assert_eq!(&test_read(0, 1), &file.content[0..1]);
+        assert_eq!(&test_read(1, 2), &file.content[1..3]);
+        assert_eq!(&test_read(9, 2), &file.content[9..]);
         assert!(test_read(10, 2).is_empty());
         assert!(test_read(11, 2).is_empty());
     }
@@ -609,7 +610,7 @@ mod tests {
     }
 
     impl<'x> File for RecorderFile<'x> {
-        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
+        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
             &'b self,
             start: u64,
             dst: Slice<B>,
@@ -782,7 +783,7 @@ mod tests {
             2048,  1024 => Err("foo".to_owned()),
         };
 
-        let buf = Vec::with_capacity(512);
+        let buf = IoBufferMut::with_capacity(512);
         let (buf, nread) = mock_file
             .read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
             .await
@@ -790,7 +791,7 @@ mod tests {
         assert_eq!(nread, 512);
         assert_eq!(&buf.into_inner()[..nread], &[0; 512]);
 
-        let buf = Vec::with_capacity(512);
+        let buf = IoBufferMut::with_capacity(512);
         let (buf, nread) = mock_file
             .read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
             .await
@@ -798,7 +799,7 @@ mod tests {
         assert_eq!(nread, 512);
         assert_eq!(&buf.into_inner()[..nread], &[1; 512]);
 
-        let buf = Vec::with_capacity(512);
+        let buf = IoBufferMut::with_capacity(512);
         let (buf, nread) = mock_file
             .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
             .await
@@ -806,7 +807,7 @@ mod tests {
         assert_eq!(nread, 10);
         assert_eq!(&buf.into_inner()[..nread], &[2; 10]);
 
-        let buf = Vec::with_capacity(1024);
+        let buf = IoBufferMut::with_capacity(1024);
         let err = mock_file
             .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
             .await
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 0c03791034..dfe2352310 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -18,7 +18,7 @@
 use std::collections::BTreeMap;
 use std::ops::Deref;
 
-use bytes::{Bytes, BytesMut};
+use bytes::Bytes;
 use pageserver_api::key::Key;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
@@ -27,6 +27,7 @@ use utils::vec_map::VecMap;
 
 use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
+use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, VirtualFile};
 
 /// Metadata bundled with the start and end offset of a blob.
@@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob {
 /// Return type of [`VectoredBlobReader::read_blobs`]
 pub struct VectoredBlobsBuf {
     /// Buffer for all blobs in this read
-    pub buf: BytesMut,
+    pub buf: IoBufferMut,
     /// Offsets into the buffer and metadata for all blobs in this read
     pub blobs: Vec<VectoredBlob>,
 }
@@ -441,7 +442,7 @@ impl<'a> VectoredBlobReader<'a> {
     pub async fn read_blobs(
         &self,
         read: &VectoredRead,
-        buf: BytesMut,
+        buf: IoBufferMut,
         ctx: &RequestContext,
     ) -> Result<VectoredBlobsBuf, std::io::Error> {
         assert!(read.size() > 0);
@@ -916,7 +917,7 @@ mod tests {
 
         // Multiply by two (compressed data might need more space), and add a few bytes for the header
         let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
-        let mut buf = BytesMut::with_capacity(reserved_bytes);
+        let mut buf = IoBufferMut::with_capacity(reserved_bytes);
 
         let vectored_blob_reader = VectoredBlobReader::new(&file);
         let meta = BlobMeta {
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 5a364b7aaf..daa8b99ab0 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -18,6 +18,9 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ};
 use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
+use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer;
+use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign};
+use owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
@@ -55,6 +58,8 @@ pub(crate) mod owned_buffers_io {
     //! but for the time being we're proving out the primitives in the neon.git repo
     //! for faster iteration.
 
+    pub(crate) mod aligned_buffer;
+    pub(crate) mod io_buf_aligned;
     pub(crate) mod io_buf_ext;
     pub(crate) mod slice;
     pub(crate) mod write;
@@ -196,7 +201,7 @@ impl VirtualFile {
         ctx: &RequestContext,
     ) -> Result<Slice<Buf>, Error>
     where
-        Buf: IoBufMut + Send,
+        Buf: IoBufAlignedMut + Send,
     {
         self.inner.read_exact_at(slice, offset, ctx).await
     }
@@ -771,7 +776,7 @@ impl VirtualFileInner {
         ctx: &RequestContext,
     ) -> Result<Slice<Buf>, Error>
     where
-        Buf: IoBufMut + Send,
+        Buf: IoBufAlignedMut + Send,
     {
         let assert_we_return_original_bounds = if cfg!(debug_assertions) {
             Some((slice.stable_ptr() as usize, slice.bytes_total()))
@@ -1222,12 +1227,14 @@ impl VirtualFileInner {
         ctx: &RequestContext,
     ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
         use crate::page_cache::PAGE_SZ;
-        let slice = Vec::with_capacity(PAGE_SZ).slice_full();
+        let slice = IoBufferMut::with_capacity(PAGE_SZ).slice_full();
         assert_eq!(slice.bytes_total(), PAGE_SZ);
         let slice = self
             .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx)
             .await?;
-        Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner()))
+        Ok(crate::tenant::block_io::BlockLease::IoBufferMut(
+            slice.into_inner(),
+        ))
     }
 
     async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
@@ -1325,10 +1332,11 @@ impl OpenFiles {
 /// server startup.
 ///
 #[cfg(not(test))]
-pub fn init(num_slots: usize, engine: IoEngineKind) {
+pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) {
     if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
         panic!("virtual_file::init called twice");
     }
+    set_io_mode(mode);
     io_engine::init(engine);
     crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }
@@ -1357,6 +1365,11 @@ pub(crate) const fn get_io_buffer_alignment() -> usize {
     DEFAULT_IO_BUFFER_ALIGNMENT
 }
 
+pub(crate) type IoBufferMut = AlignedBufferMut<ConstAlign<{ get_io_buffer_alignment() }>>;
+pub(crate) type IoBuffer = AlignedBuffer<ConstAlign<{ get_io_buffer_alignment() }>>;
+pub(crate) type IoPageSlice<'a> =
+    AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>;
+
 static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
 
 pub(crate) fn set_io_mode(mode: IoMode) {
@@ -1395,10 +1408,10 @@ mod tests {
     impl MaybeVirtualFile {
         async fn read_exact_at(
             &self,
-            mut slice: tokio_epoll_uring::Slice<Vec<u8>>,
+            mut slice: tokio_epoll_uring::Slice<IoBufferMut>,
             offset: u64,
             ctx: &RequestContext,
-        ) -> Result<tokio_epoll_uring::Slice<Vec<u8>>, Error> {
+        ) -> Result<tokio_epoll_uring::Slice<IoBufferMut>, Error> {
             match self {
                 MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await,
                 MaybeVirtualFile::File(file) => {
@@ -1466,12 +1479,13 @@ mod tests {
             len: usize,
             ctx: &RequestContext,
         ) -> Result<String, Error> {
-            let slice = Vec::with_capacity(len).slice_full();
+            let slice = IoBufferMut::with_capacity(len).slice_full();
             assert_eq!(slice.bytes_total(), len);
             let slice = self.read_exact_at(slice, pos, ctx).await?;
-            let vec = slice.into_inner();
-            assert_eq!(vec.len(), len);
-            Ok(String::from_utf8(vec).unwrap())
+            let buf = slice.into_inner();
+            assert_eq!(buf.len(), len);
+
+            Ok(String::from_utf8(buf.to_vec()).unwrap())
         }
     }
 
@@ -1695,7 +1709,7 @@ mod tests {
             let files = files.clone();
             let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
             let hdl = rt.spawn(async move {
-                let mut buf = vec![0u8; SIZE];
+                let mut buf = IoBufferMut::with_capacity_zeroed(SIZE);
                 let mut rng = rand::rngs::OsRng;
                 for _ in 1..1000 {
                     let f = &files[rng.gen_range(0..files.len())];
@@ -1704,7 +1718,7 @@ mod tests {
                         .await
                         .unwrap()
                         .into_inner();
-                    assert!(buf == SAMPLE);
+                    assert!(buf[..] == SAMPLE);
                 }
             });
             hdls.push(hdl);
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs
new file mode 100644
index 0000000000..8ffc29b93d
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs
@@ -0,0 +1,9 @@
+pub mod alignment;
+pub mod buffer;
+pub mod buffer_mut;
+pub mod raw;
+pub mod slice;
+
+pub use alignment::*;
+pub use buffer_mut::AlignedBufferMut;
+pub use slice::AlignedSlice;
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
new file mode 100644
index 0000000000..933b78a13b
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
@@ -0,0 +1,26 @@
+pub trait Alignment: std::marker::Unpin + 'static {
+    /// Returns the required alignment.
+    fn align(&self) -> usize;
+}
+
+/// Alignment at compile time.
+#[derive(Debug)]
+pub struct ConstAlign<const A: usize>;
+
+impl<const A: usize> Alignment for ConstAlign<A> {
+    fn align(&self) -> usize {
+        A
+    }
+}
+
+/// Alignment at run time.
+#[derive(Debug)]
+pub struct RuntimeAlign {
+    align: usize,
+}
+
+impl Alignment for RuntimeAlign {
+    fn align(&self) -> usize {
+        self.align
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
new file mode 100644
index 0000000000..2fba6d699b
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
@@ -0,0 +1,124 @@
+use std::{
+    ops::{Deref, Range, RangeBounds},
+    sync::Arc,
+};
+
+use super::{alignment::Alignment, raw::RawAlignedBuffer};
+
+/// A shared, immutable aligned buffer type.
+pub struct AlignedBuffer<A: Alignment> {
+    /// Shared raw buffer.
+    raw: Arc<RawAlignedBuffer<A>>,
+    /// Range that specifies the current slice.
+    range: Range<usize>,
+}
+
+impl<A: Alignment> AlignedBuffer<A> {
+    /// Creates an immutable `IoBuffer` from the raw buffer
+    pub(super) fn from_raw(raw: RawAlignedBuffer<A>, range: Range<usize>) -> Self {
+        AlignedBuffer {
+            raw: Arc::new(raw),
+            range,
+        }
+    }
+
+    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.range.len()
+    }
+
+    /// Returns the alignment of the buffer.
+    #[inline]
+    pub fn align(&self) -> usize {
+        self.raw.align()
+    }
+
+    #[inline]
+    fn as_ptr(&self) -> *const u8 {
+        // SAFETY: `self.range.start` is guaranteed to be within [0, self.raw.len()].
+        unsafe { self.raw.as_ptr().add(self.range.start) }
+    }
+
+    /// Extracts a slice containing the entire buffer.
+    ///
+    /// Equivalent to `&s[..]`.
+    #[inline]
+    fn as_slice(&self) -> &[u8] {
+        &self.raw.as_slice()[self.range.start..self.range.end]
+    }
+
+    /// Returns a slice of self for the index range `[begin..end)`.
+    pub fn slice(&self, range: impl RangeBounds<usize>) -> Self {
+        use core::ops::Bound;
+        let len = self.len();
+
+        let begin = match range.start_bound() {
+            Bound::Included(&n) => n,
+            Bound::Excluded(&n) => n.checked_add(1).expect("out of range"),
+            Bound::Unbounded => 0,
+        };
+
+        let end = match range.end_bound() {
+            Bound::Included(&n) => n.checked_add(1).expect("out of range"),
+            Bound::Excluded(&n) => n,
+            Bound::Unbounded => len,
+        };
+
+        assert!(
+            begin <= end,
+            "range start must not be greater than end: {:?} <= {:?}",
+            begin,
+            end,
+        );
+        assert!(
+            end <= len,
+            "range end out of bounds: {:?} <= {:?}",
+            end,
+            len,
+        );
+
+        let begin = self.range.start + begin;
+        let end = self.range.start + end;
+
+        AlignedBuffer {
+            raw: Arc::clone(&self.raw),
+            range: begin..end,
+        }
+    }
+}
+
+impl<A: Alignment> Deref for AlignedBuffer<A> {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+impl<A: Alignment> AsRef<[u8]> for AlignedBuffer<A> {
+    fn as_ref(&self) -> &[u8] {
+        self.as_slice()
+    }
+}
+
+impl<A: Alignment> PartialEq<[u8]> for AlignedBuffer<A> {
+    fn eq(&self, other: &[u8]) -> bool {
+        self.as_slice().eq(other)
+    }
+}
+
+/// SAFETY: the underlying buffer references a stable memory region.
+unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBuffer<A> {
+    fn stable_ptr(&self) -> *const u8 {
+        self.as_ptr()
+    }
+
+    fn bytes_init(&self) -> usize {
+        self.len()
+    }
+
+    fn bytes_total(&self) -> usize {
+        self.len()
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
new file mode 100644
index 0000000000..b3675d1aea
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
@@ -0,0 +1,347 @@
+use std::ops::{Deref, DerefMut};
+
+use super::{
+    alignment::{Alignment, ConstAlign},
+    buffer::AlignedBuffer,
+    raw::RawAlignedBuffer,
+};
+
+/// A mutable aligned buffer type.
+#[derive(Debug)]
+pub struct AlignedBufferMut<A: Alignment> {
+    raw: RawAlignedBuffer<A>,
+}
+
+impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
+    /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment.
+    ///
+    /// The buffer will be able to hold at most `capacity` elements and will never resize.
+    ///
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
+    /// * `align` must not be zero,
+    ///
+    /// * `align` must be a power of two,
+    ///
+    /// * `capacity`, when rounded up to the nearest multiple of `align`,
+    ///    must not overflow isize (i.e., the rounded value must be
+    ///    less than or equal to `isize::MAX`).
+    pub fn with_capacity(capacity: usize) -> Self {
+        AlignedBufferMut {
+            raw: RawAlignedBuffer::with_capacity(capacity),
+        }
+    }
+
+    /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros.
+    pub fn with_capacity_zeroed(capacity: usize) -> Self {
+        use bytes::BufMut;
+        let mut buf = Self::with_capacity(capacity);
+        buf.put_bytes(0, capacity);
+        // SAFETY: `put_bytes` filled the entire buffer.
+        unsafe { buf.set_len(capacity) };
+        buf
+    }
+}
+
+impl<A: Alignment> AlignedBufferMut<A> {
+    /// Returns the total number of bytes the buffer can hold.
+    #[inline]
+    pub fn capacity(&self) -> usize {
+        self.raw.capacity()
+    }
+
+    /// Returns the alignment of the buffer.
+    #[inline]
+    pub fn align(&self) -> usize {
+        self.raw.align()
+    }
+
+    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.raw.len()
+    }
+
+    /// Force the length of the buffer to `new_len`.
+    #[inline]
+    unsafe fn set_len(&mut self, new_len: usize) {
+        self.raw.set_len(new_len)
+    }
+
+    #[inline]
+    fn as_ptr(&self) -> *const u8 {
+        self.raw.as_ptr()
+    }
+
+    #[inline]
+    fn as_mut_ptr(&mut self) -> *mut u8 {
+        self.raw.as_mut_ptr()
+    }
+
+    /// Extracts a slice containing the entire buffer.
+    ///
+    /// Equivalent to `&s[..]`.
+    #[inline]
+    fn as_slice(&self) -> &[u8] {
+        self.raw.as_slice()
+    }
+
+    /// Extracts a mutable slice of the entire buffer.
+    ///
+    /// Equivalent to `&mut s[..]`.
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        self.raw.as_mut_slice()
+    }
+
+    /// Drops all the contents of the buffer, setting its length to `0`.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.raw.clear()
+    }
+
+    /// Reserves capacity for at least `additional` more bytes to be inserted
+    /// in the given `IoBufferMut`. The collection may reserve more space to
+    /// speculatively avoid frequent reallocations. After calling `reserve`,
+    /// capacity will be greater than or equal to `self.len() + additional`.
+    /// Does nothing if capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
+    pub fn reserve(&mut self, additional: usize) {
+        self.raw.reserve(additional);
+    }
+
+    /// Shortens the buffer, keeping the first len bytes.
+    pub fn truncate(&mut self, len: usize) {
+        self.raw.truncate(len);
+    }
+
+    /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8].
+    pub fn leak<'a>(self) -> &'a mut [u8] {
+        self.raw.leak()
+    }
+
+    pub fn freeze(self) -> AlignedBuffer<A> {
+        let len = self.len();
+        AlignedBuffer::from_raw(self.raw, 0..len)
+    }
+}
+
+impl<A: Alignment> Deref for AlignedBufferMut<A> {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+impl<A: Alignment> DerefMut for AlignedBufferMut<A> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.as_mut_slice()
+    }
+}
+
+impl<A: Alignment> AsRef<[u8]> for AlignedBufferMut<A> {
+    fn as_ref(&self) -> &[u8] {
+        self.as_slice()
+    }
+}
+
+impl<A: Alignment> AsMut<[u8]> for AlignedBufferMut<A> {
+    fn as_mut(&mut self) -> &mut [u8] {
+        self.as_mut_slice()
+    }
+}
+
+impl<A: Alignment> PartialEq<[u8]> for AlignedBufferMut<A> {
+    fn eq(&self, other: &[u8]) -> bool {
+        self.as_slice().eq(other)
+    }
+}
+
+/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advanced past have been initialized.
+unsafe impl<A: Alignment> bytes::BufMut for AlignedBufferMut<A> {
+    #[inline]
+    fn remaining_mut(&self) -> usize {
+        // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`.
+        // Thus, it can have at most `self.capacity` bytes.
+        self.capacity() - self.len()
+    }
+
+    // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized.
+    #[inline]
+    unsafe fn advance_mut(&mut self, cnt: usize) {
+        let len = self.len();
+        let remaining = self.remaining_mut();
+
+        if remaining < cnt {
+            panic_advance(cnt, remaining);
+        }
+
+        // Addition will not overflow since the sum is at most the capacity.
+        self.set_len(len + cnt);
+    }
+
+    #[inline]
+    fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice {
+        let cap = self.capacity();
+        let len = self.len();
+
+        // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be
+        // valid for `cap - len` bytes. The subtraction will not underflow since
+        // `len <= cap`.
+        unsafe {
+            bytes::buf::UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len)
+        }
+    }
+}
+
+/// Panic with a nice error message.
+#[cold]
+fn panic_advance(idx: usize, len: usize) -> ! {
+    panic!(
+        "advance out of bounds: the len is {} but advancing by {}",
+        len, idx
+    );
+}
+
+/// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer,
+/// and the underlying pointer remains stable while io-uring is owning the buffer.
+/// The tokio-epoll-uring crate itself will not resize the buffer and will respect
+/// [`tokio_epoll_uring::IoBuf::bytes_total`].
+unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBufferMut<A> {
+    fn stable_ptr(&self) -> *const u8 {
+        self.as_ptr()
+    }
+
+    fn bytes_init(&self) -> usize {
+        self.len()
+    }
+
+    fn bytes_total(&self) -> usize {
+        self.capacity()
+    }
+}
+
+// SAFETY: See above.
+unsafe impl<A: Alignment> tokio_epoll_uring::IoBufMut for AlignedBufferMut<A> {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.as_mut_ptr()
+    }
+
+    unsafe fn set_init(&mut self, init_len: usize) {
+        if self.len() < init_len {
+            self.set_len(init_len);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    const ALIGN: usize = 4 * 1024;
+    type TestIoBufferMut = AlignedBufferMut<ConstAlign<ALIGN>>;
+
+    #[test]
+    fn test_with_capacity() {
+        let v = TestIoBufferMut::with_capacity(ALIGN * 4);
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN * 4);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+
+        let v = TestIoBufferMut::with_capacity(ALIGN / 2);
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN / 2);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+
+    #[test]
+    fn test_with_capacity_zeroed() {
+        let v = TestIoBufferMut::with_capacity_zeroed(ALIGN);
+        assert_eq!(v.len(), ALIGN);
+        assert_eq!(v.capacity(), ALIGN);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+        assert_eq!(&v[..], &[0; ALIGN])
+    }
+
+    #[test]
+    fn test_reserve() {
+        use bytes::BufMut;
+        let mut v = TestIoBufferMut::with_capacity(ALIGN);
+        let capacity = v.capacity();
+        v.reserve(capacity);
+        assert_eq!(v.capacity(), capacity);
+        let data = [b'a'; ALIGN];
+        v.put(&data[..]);
+        v.reserve(capacity);
+        assert!(v.capacity() >= capacity * 2);
+        assert_eq!(&v[..], &data[..]);
+        let capacity = v.capacity();
+        v.clear();
+        v.reserve(capacity);
+        assert_eq!(capacity, v.capacity());
+    }
+
+    #[test]
+    fn test_bytes_put() {
+        use bytes::BufMut;
+        let mut v = TestIoBufferMut::with_capacity(ALIGN * 4);
+        let x = [b'a'; ALIGN];
+
+        for _ in 0..2 {
+            for _ in 0..4 {
+                v.put(&x[..]);
+            }
+            assert_eq!(v.len(), ALIGN * 4);
+            assert_eq!(v.capacity(), ALIGN * 4);
+            assert_eq!(v.align(), ALIGN);
+            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+            v.clear()
+        }
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN * 4);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_bytes_put_panic() {
+        use bytes::BufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = TestIoBufferMut::with_capacity(ALIGN * 4);
+        let x = [b'a'; ALIGN];
+        for _ in 0..5 {
+            v.put_slice(&x[..]);
+        }
+    }
+
+    #[test]
+    fn test_io_buf_put_slice() {
+        use tokio_epoll_uring::BoundedBufMut;
+        const ALIGN: usize = 4 * 1024;
+        let mut v = TestIoBufferMut::with_capacity(ALIGN);
+        let x = [b'a'; ALIGN];
+
+        for _ in 0..2 {
+            v.put_slice(&x[..]);
+            assert_eq!(v.len(), ALIGN);
+            assert_eq!(v.capacity(), ALIGN);
+            assert_eq!(v.align(), ALIGN);
+            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+            v.clear()
+        }
+        assert_eq!(v.len(), 0);
+        assert_eq!(v.capacity(), ALIGN);
+        assert_eq!(v.align(), ALIGN);
+        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
new file mode 100644
index 0000000000..6c26dec0db
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
@@ -0,0 +1,216 @@
+use core::slice;
+use std::{
+    alloc::{self, Layout},
+    cmp,
+    mem::ManuallyDrop,
+};
+
+use super::alignment::{Alignment, ConstAlign};
+
+#[derive(Debug)]
+struct AlignedBufferPtr(*mut u8);
+
+// SAFETY: We guarantee that no one besides `AlignedBufferPtr` itself has the raw pointer.
+unsafe impl Send for AlignedBufferPtr {}
+
+// SAFETY: We guarantee that no one besides `AlignedBufferPtr` itself has the raw pointer.
+unsafe impl Sync for AlignedBufferPtr {}
+
+/// An aligned buffer type.
+#[derive(Debug)]
+pub struct RawAlignedBuffer<A: Alignment> {
+    ptr: AlignedBufferPtr,
+    capacity: usize,
+    len: usize,
+    align: A,
+}
+
+impl<const A: usize> RawAlignedBuffer<ConstAlign<A>> {
+    /// Constructs a new, empty `RawAlignedBuffer` with at least the specified capacity and alignment.
+    ///
+    /// The buffer will be able to hold at most `capacity` elements and will never resize.
+    ///
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
+    /// * `align` must not be zero,
+    ///
+    /// * `align` must be a power of two,
+    ///
+    /// * `capacity`, when rounded up to the nearest multiple of `align`,
+    ///    must not overflow isize (i.e., the rounded value must be
+    ///    less than or equal to `isize::MAX`).
+    pub fn with_capacity(capacity: usize) -> Self {
+        let align = ConstAlign::<A>;
+        let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout");
+
+        // SAFETY:  Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout.
+        let ptr = unsafe {
+            let ptr = alloc::alloc(layout);
+            if ptr.is_null() {
+                alloc::handle_alloc_error(layout);
+            }
+            AlignedBufferPtr(ptr)
+        };
+
+        RawAlignedBuffer {
+            ptr,
+            capacity,
+            len: 0,
+            align,
+        }
+    }
+}
+
+impl<A: Alignment> RawAlignedBuffer<A> {
+    /// Returns the total number of bytes the buffer can hold.
+    #[inline]
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    /// Returns the alignment of the buffer.
+    #[inline]
+    pub fn align(&self) -> usize {
+        self.align.align()
+    }
+
+    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Force the length of the buffer to `new_len`.
+    #[inline]
+    pub unsafe fn set_len(&mut self, new_len: usize) {
+        debug_assert!(new_len <= self.capacity());
+        self.len = new_len;
+    }
+
+    #[inline]
+    pub fn as_ptr(&self) -> *const u8 {
+        self.ptr.0
+    }
+
+    #[inline]
+    pub fn as_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr.0
+    }
+
+    /// Extracts a slice containing the entire buffer.
+    ///
+    /// Equivalent to `&s[..]`.
+    #[inline]
+    pub fn as_slice(&self) -> &[u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
+    }
+
+    /// Extracts a mutable slice of the entire buffer.
+    ///
+    /// Equivalent to `&mut s[..]`.
+    pub fn as_mut_slice(&mut self) -> &mut [u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
+    }
+
+    /// Drops all the contents of the buffer, setting its length to `0`.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Reserves capacity for at least `additional` more bytes to be inserted
+    /// in the given `RawAlignedBuffer`. The collection may reserve more space to
+    /// speculatively avoid frequent reallocations. After calling `reserve`,
+    /// capacity will be greater than or equal to `self.len() + additional`.
+    /// Does nothing if capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
+    pub fn reserve(&mut self, additional: usize) {
+        if additional > self.capacity() - self.len() {
+            self.reserve_inner(additional);
+        }
+    }
+
+    fn reserve_inner(&mut self, additional: usize) {
+        let Some(required_cap) = self.len().checked_add(additional) else {
+            capacity_overflow()
+        };
+
+        let old_capacity = self.capacity();
+        let align = self.align();
+        // This guarantees exponential growth. The doubling cannot overflow
+        // because `cap <= isize::MAX` and the type of `cap` is `usize`.
+        let cap = cmp::max(old_capacity * 2, required_cap);
+
+        if !is_valid_alloc(cap) {
+            capacity_overflow()
+        }
+        let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout");
+
+        let old_ptr = self.as_mut_ptr();
+
+        // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout,
+        // and we panic on a null pointer.
+        let (ptr, cap) = unsafe {
+            let old_layout = Layout::from_size_align_unchecked(old_capacity, align);
+            let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size());
+            if ptr.is_null() {
+                alloc::handle_alloc_error(new_layout);
+            }
+            (AlignedBufferPtr(ptr), cap)
+        };
+
+        self.ptr = ptr;
+        self.capacity = cap;
+    }
+
+    /// Shortens the buffer, keeping the first len bytes.
+    pub fn truncate(&mut self, len: usize) {
+        if len > self.len {
+            return;
+        }
+        self.len = len;
+    }
+
+    /// Consumes and leaks the `RawAlignedBuffer`, returning a mutable reference to the contents, &'a mut [u8].
+    pub fn leak<'a>(self) -> &'a mut [u8] {
+        let mut buf = ManuallyDrop::new(self);
+        // SAFETY: leaking the buffer as intended.
+        unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) }
+    }
+}
+
+fn capacity_overflow() -> ! {
+    panic!("capacity overflow")
+}
+
+// We need to guarantee the following:
+// * We don't ever allocate `> isize::MAX` byte-size objects.
+// * We don't overflow `usize::MAX` and actually allocate too little.
+//
+// On 64-bit we just need to check for overflow since trying to allocate
+// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
+// an extra guard for this in case we're running on a platform which can use
+// all 4GB in user-space, e.g., PAE or x32.
+#[inline]
+fn is_valid_alloc(alloc_size: usize) -> bool {
+    !(usize::BITS < 64 && alloc_size > isize::MAX as usize)
+}
+
+impl<A: Alignment> Drop for RawAlignedBuffer<A> {
+    fn drop(&mut self) {
+        // SAFETY: memory was allocated with std::alloc::alloc with the same layout.
+        unsafe {
+            alloc::dealloc(
+                self.as_mut_ptr(),
+                Layout::from_size_align_unchecked(self.capacity, self.align.align()),
+            )
+        }
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
new file mode 100644
index 0000000000..6cecf34c1c
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
@@ -0,0 +1,40 @@
+use std::ops::{Deref, DerefMut};
+
+use super::alignment::{Alignment, ConstAlign};
+
+/// Newtype for an aligned slice.
+pub struct AlignedSlice<'a, const N: usize, A: Alignment> {
+    /// underlying byte slice
+    buf: &'a mut [u8; N],
+    /// alignment marker
+    _align: A,
+}
+
+impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign<A>> {
+    /// Create a new aligned slice from a mutable byte slice. The input must already satisfy the alignment.
+    pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self {
+        let _align = ConstAlign::<A>;
+        assert_eq!(buf.as_ptr().align_offset(_align.align()), 0);
+        AlignedSlice { buf, _align }
+    }
+}
+
+impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
+    type Target = [u8; N];
+
+    fn deref(&self) -> &Self::Target {
+        self.buf
+    }
+}
+
+impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.buf
+    }
+}
+
+impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> {
+    fn as_ref(&self) -> &[u8; N] {
+        self.buf
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
new file mode 100644
index 0000000000..dba695196e
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
@@ -0,0 +1,9 @@
+use tokio_epoll_uring::IoBufMut;
+
+use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf};
+
+pub trait IoBufAlignedMut: IoBufMut {}
+
+impl IoBufAlignedMut for IoBufferMut {}
+
+impl IoBufAlignedMut for PageWriteGuardBuf {}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
index 7c773b6b21..c3940cf6ce 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -1,5 +1,6 @@
 //! See [`FullSlice`].
 
+use crate::virtual_file::{IoBuffer, IoBufferMut};
 use bytes::{Bytes, BytesMut};
 use std::ops::{Deref, Range};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
@@ -76,3 +77,5 @@ macro_rules! impl_io_buf_ext {
 impl_io_buf_ext!(Bytes);
 impl_io_buf_ext!(BytesMut);
 impl_io_buf_ext!(Vec<u8>);
+impl_io_buf_ext!(IoBufferMut);
+impl_io_buf_ext!(IoBuffer);

From 34b6bd416a8df8cf0d51f707beaca30dbdbe2adc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 21 Oct 2024 17:33:05 +0200
Subject: [PATCH 34/48] offloaded timeline list API (#9461)

Add a way to list the offloaded timelines.

Before, one had to look at logs to figure out if a timeline has been
offloaded or not, or use the non-presence of a certain timeline in the
list of normal timelines. Now, one can list them directly.

Part of #8088
---
 libs/pageserver_api/src/models.rs | 17 +++++++
 pageserver/src/http/routes.rs     | 80 ++++++++++++++++++++++++++++++-
 pageserver/src/tenant.rs          | 25 +++++++++-
 3 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 5b0b6bebe3..e08bf40801 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -684,6 +684,23 @@ pub struct TimelineArchivalConfigRequest {
     pub state: TimelineArchivalState,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct TimelinesInfoAndOffloaded {
+    pub timelines: Vec<TimelineInfo>,
+    pub offloaded: Vec<OffloadedTimelineInfo>,
+}
+
+/// Analog of [`TimelineInfo`] for offloaded timelines.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct OffloadedTimelineInfo {
+    pub tenant_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    /// Whether the timeline has a parent it has been branched off from or not
+    pub ancestor_timeline_id: Option<TimelineId>,
+    /// Whether to retain the branch lsn at the ancestor or not
+    pub ancestor_retain_lsn: Option<Lsn>,
+}
+
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 8f928fd81b..a254f1683d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -26,6 +26,7 @@ use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::LocationConfigMode;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
+use pageserver_api::models::OffloadedTimelineInfo;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigRequest;
@@ -37,6 +38,7 @@ use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TimelineArchivalConfigRequest;
+use pageserver_api::models::TimelinesInfoAndOffloaded;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
@@ -81,6 +83,7 @@ use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
+use crate::tenant::OffloadedTimeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
@@ -477,6 +480,22 @@ async fn build_timeline_info_common(
     Ok(info)
 }
 
+fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> OffloadedTimelineInfo {
+    let &OffloadedTimeline {
+        tenant_shard_id,
+        timeline_id,
+        ancestor_retain_lsn,
+        ancestor_timeline_id,
+        ..
+    } = offloaded.as_ref();
+    OffloadedTimelineInfo {
+        tenant_id: tenant_shard_id,
+        timeline_id,
+        ancestor_retain_lsn,
+        ancestor_timeline_id,
+    }
+}
+
 // healthcheck handler
 async fn status_handler(
     request: Request<Body>,
@@ -643,7 +662,7 @@ async fn timeline_list_handler(
             )
             .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
             .await
-            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
+            .context("Failed to build timeline info")
             .map_err(ApiError::InternalServerError)?;
 
             response_data.push(timeline_info);
@@ -658,6 +677,62 @@ async fn timeline_list_handler(
     json_response(StatusCode::OK, response_data)
 }
 
+async fn timeline_and_offloaded_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let include_non_incremental_logical_size: Option<bool> =
+        parse_query_param(&request, "include-non-incremental-logical-size")?;
+    let force_await_initial_logical_size: Option<bool> =
+        parse_query_param(&request, "force-await-initial-logical-size")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
+    let response_data = async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        let (timelines, offloadeds) = tenant.list_timelines_and_offloaded();
+
+        let mut timeline_infos = Vec::with_capacity(timelines.len());
+        for timeline in timelines {
+            let timeline_info = build_timeline_info(
+                &timeline,
+                include_non_incremental_logical_size.unwrap_or(false),
+                force_await_initial_logical_size.unwrap_or(false),
+                &ctx,
+            )
+            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
+            .await
+            .context("Failed to build timeline info")
+            .map_err(ApiError::InternalServerError)?;
+
+            timeline_infos.push(timeline_info);
+        }
+        let offloaded_infos = offloadeds
+            .into_iter()
+            .map(|offloaded| build_timeline_offloaded_info(&offloaded))
+            .collect::<Vec<_>>();
+        let res = TimelinesInfoAndOffloaded {
+            timelines: timeline_infos,
+            offloaded: offloaded_infos,
+        };
+        Ok::<TimelinesInfoAndOffloaded, ApiError>(res)
+    }
+    .instrument(info_span!("timeline_and_offloaded_list",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()))
+    .await?;
+
+    json_response(StatusCode::OK, response_data)
+}
+
 async fn timeline_preserve_initdb_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -2993,6 +3068,9 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
             api_handler(r, timeline_list_handler)
         })
+        .get("/v1/tenant/:tenant_shard_id/timeline_and_offloaded", |r| {
+            api_handler(r, timeline_and_offloaded_list_handler)
+        })
         .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
             api_handler(r, timeline_create_handler)
         })
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1066d165cd..41d21ef041 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1755,7 +1755,7 @@ impl Tenant {
     }
 
     /// Lists timelines the tenant contains.
-    /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use.
+    /// It's up to callers to omit certain timelines that are not considered ready for use.
     pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
         self.timelines
             .lock()
@@ -1765,6 +1765,29 @@ impl Tenant {
             .collect()
     }
 
+    /// Lists timelines the tenant manages, including offloaded ones.
+    ///
+    /// It's up to callers to omit certain timelines that are not considered ready for use.
+    pub fn list_timelines_and_offloaded(
+        &self,
+    ) -> (Vec<Arc<Timeline>>, Vec<Arc<OffloadedTimeline>>) {
+        let timelines = self
+            .timelines
+            .lock()
+            .unwrap()
+            .values()
+            .map(Arc::clone)
+            .collect();
+        let offloaded = self
+            .timelines_offloaded
+            .lock()
+            .unwrap()
+            .values()
+            .map(Arc::clone)
+            .collect();
+        (timelines, offloaded)
+    }
+
     pub fn list_timeline_ids(&self) -> Vec<TimelineId> {
         self.timelines.lock().unwrap().keys().cloned().collect()
     }

From 94369af7825ca26fb5aea805f2c85bdb877ceb74 Mon Sep 17 00:00:00 2001
From: David Gomes <david@neon.tech>
Date: Mon, 21 Oct 2024 18:39:30 -0500
Subject: [PATCH 35/48] chore(compute): bumps pg_session_jwt to latest version
 (#9474)

---
 compute/compute-node.Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 74970696b5..6451e309f0 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -975,8 +975,8 @@ ARG PG_VERSION
 RUN case "${PG_VERSION}" in "v17") \
     echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
     esac && \
-    wget https://github.com/neondatabase/pg_session_jwt/archive/e642528f429dd3f5403845a50191b78d434b84a6.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "1a69210703cc91224785e59a0a67562dd9eed9a0914ac84b11447582ca0d5b93 pg_session_jwt.tar.gz" | sha256sum --check && \
+    wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \
     mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release

From 1e8e04bb2c9b2cdb17f680c8d0df697289114e17 Mon Sep 17 00:00:00 2001
From: Arseny Sher <ars@neon.tech>
Date: Tue, 22 Oct 2024 09:11:36 +0300
Subject: [PATCH 36/48] safekeeper: refactor timeline initialization (#9362)

Always do timeline init through atomic rename of temp directory. Add
GlobalTimelines::load_temp_timeline which does this, and use it from
both pull_timeline and basic timeline creation. Fixes a collection
of issues:
- previously timeline creation didn't really flush the cfile to disk
  due to the 'nothing to do if state didn't change' check;
- even if it did, without tmp dir it is possible to lose the cfile
  but leave timeline dir in place, making it look corrupted;
- tenant directory creation fsync was missing in timeline creation;
- pull_timeline is now protected from running concurrently with both
  itself and timeline creation;
- the global timelines map now has a special CreationInProgress
  entry type which prevents anyone from getting access to the timeline
  while it is being created (previously one could get access to it,
  but it was locked during creation, which is valid but confusing if
  creation failed).

fixes #8927
---
 safekeeper/src/control_file.rs                |  21 +-
 safekeeper/src/copy_timeline.rs               |  11 +-
 safekeeper/src/pull_timeline.rs               | 106 +-----
 safekeeper/src/receive_wal.rs                 |   3 +-
 safekeeper/src/state.rs                       |  39 +-
 safekeeper/src/timeline.rs                    |  91 +----
 safekeeper/src/timelines_global_map.rs        | 339 +++++++++++-------
 safekeeper/src/wal_storage.rs                 |  10 +-
 .../tests/walproposer_sim/safekeeper.rs       |  20 +-
 9 files changed, 290 insertions(+), 350 deletions(-)

diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index 8b252b4ab4..cd82e43780 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -66,22 +66,25 @@ impl FileStorage {
         })
     }
 
-    /// Create file storage for a new timeline, but don't persist it yet.
-    pub fn create_new(
-        timeline_dir: Utf8PathBuf,
+    /// Create and reliably persist new control file at given location.
+    ///
+    /// Note: we normally call this in a temp directory for atomic init, so
+    /// the returned FileStorage is only of interest in tests.
+    pub async fn create_new(
+        dir: Utf8PathBuf,
         conf: &SafeKeeperConf,
         state: TimelinePersistentState,
     ) -> Result<FileStorage> {
         // we don't support creating new timelines in offloaded state
         assert!(matches!(state.eviction_state, EvictionState::Present));
 
-        let store = FileStorage {
-            timeline_dir,
+        let mut store = FileStorage {
+            timeline_dir: dir,
             no_sync: conf.no_sync,
-            state,
+            state: state.clone(),
             last_persist_at: Instant::now(),
         };
-
+        store.persist(&state).await?;
         Ok(store)
     }
 
@@ -190,8 +193,6 @@ impl TimelinePersistentState {
 
 impl Storage for FileStorage {
     /// Persists state durably to the underlying storage.
-    ///
-    /// For a description, see <https://lwn.net/Articles/457667/>.
     async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
         let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
 
@@ -269,7 +270,7 @@ mod test {
             .await
             .expect("failed to create timeline dir");
         let state = TimelinePersistentState::empty();
-        let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
+        let storage = FileStorage::create_new(timeline_dir, conf, state.clone()).await?;
         Ok((storage, state))
     }
 
diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs
index 220988c3ce..52b13dc5e3 100644
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -12,10 +12,10 @@ use tracing::{info, warn};
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
 use crate::{
-    control_file::{FileStorage, Storage},
-    pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
+    control_file::FileStorage,
     state::TimelinePersistentState,
     timeline::{Timeline, TimelineError, WalResidentTimeline},
+    timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
     wal_backup::copy_s3_segments,
     wal_storage::{wal_file_paths, WalReader},
     GlobalTimelines,
@@ -149,17 +149,16 @@ pub async fn handle_request(request: Request) -> Result<()> {
         vec![],
         request.until_lsn,
         start_lsn,
-    );
+    )?;
     new_state.timeline_start_lsn = start_lsn;
     new_state.peer_horizon_lsn = request.until_lsn;
     new_state.backup_lsn = new_backup_lsn;
 
-    let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?;
-    file_storage.persist(&new_state).await?;
+    FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone()).await?;
 
     // now we have a ready timeline in a temp directory
     validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
-    load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
+    GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?;
 
     Ok(())
 }
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index c772ae6de7..c7f5165f90 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -1,7 +1,6 @@
 use anyhow::{anyhow, bail, Context, Result};
 use bytes::Bytes;
 use camino::Utf8PathBuf;
-use camino_tempfile::Utf8TempDir;
 use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
@@ -9,7 +8,6 @@ use serde::{Deserialize, Serialize};
 use std::{
     cmp::min,
     io::{self, ErrorKind},
-    sync::Arc,
 };
 use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
 use tokio_tar::{Archive, Builder, Header};
@@ -20,7 +18,7 @@ use tokio_util::{
 use tracing::{error, info, instrument};
 
 use crate::{
-    control_file::{self, CONTROL_FILE_NAME},
+    control_file::CONTROL_FILE_NAME,
     debug_dump,
     http::{
         client::{self, Client},
@@ -28,13 +26,14 @@ use crate::{
     },
     safekeeper::Term,
     state::TimelinePersistentState,
-    timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline},
+    timeline::WalResidentTimeline,
+    timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
     wal_backup,
-    wal_storage::{self, open_wal_file, Storage},
-    GlobalTimelines, SafeKeeperConf,
+    wal_storage::open_wal_file,
+    GlobalTimelines,
 };
 use utils::{
-    crashsafe::{durable_rename, fsync_async_opt},
+    crashsafe::fsync_async_opt,
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
     logging::SecretString,
     lsn::Lsn,
@@ -428,100 +427,9 @@ async fn pull_timeline(
     assert!(status.commit_lsn <= status.flush_lsn);
 
     // Finally, load the timeline.
-    let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?;
+    let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?;
 
     Ok(Response {
         safekeeper_host: host,
     })
 }
-
-/// Create temp directory for a new timeline. It needs to be located on the same
-/// filesystem as the rest of the timelines. It will be automatically deleted when
-/// Utf8TempDir goes out of scope.
-pub async fn create_temp_timeline_dir(
-    conf: &SafeKeeperConf,
-    ttid: TenantTimelineId,
-) -> Result<(Utf8TempDir, Utf8PathBuf)> {
-    // conf.workdir is usually /storage/safekeeper/data
-    // will try to transform it into /storage/safekeeper/tmp
-    let temp_base = conf
-        .workdir
-        .parent()
-        .ok_or(anyhow::anyhow!("workdir has no parent"))?
-        .join("tmp");
-
-    tokio::fs::create_dir_all(&temp_base).await?;
-
-    let tli_dir = camino_tempfile::Builder::new()
-        .suffix("_temptli")
-        .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
-        .tempdir_in(temp_base)?;
-
-    let tli_dir_path = tli_dir.path().to_path_buf();
-
-    Ok((tli_dir, tli_dir_path))
-}
-
-/// Do basic validation of a temp timeline, before moving it to the global map.
-pub async fn validate_temp_timeline(
-    conf: &SafeKeeperConf,
-    ttid: TenantTimelineId,
-    path: &Utf8PathBuf,
-) -> Result<(Lsn, Lsn)> {
-    let control_path = path.join("safekeeper.control");
-
-    let control_store = control_file::FileStorage::load_control_file(control_path)?;
-    if control_store.server.wal_seg_size == 0 {
-        bail!("wal_seg_size is not set");
-    }
-
-    let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
-
-    let commit_lsn = control_store.commit_lsn;
-    let flush_lsn = wal_store.flush_lsn();
-
-    Ok((commit_lsn, flush_lsn))
-}
-
-/// Move timeline from a temp directory to the main storage, and load it to the global map.
-///
-/// This operation is done under a lock to prevent bugs if several concurrent requests are
-/// trying to load the same timeline. Note that it doesn't guard against creating the
-/// timeline with the same ttid, but no one should be doing this anyway.
-pub async fn load_temp_timeline(
-    conf: &SafeKeeperConf,
-    ttid: TenantTimelineId,
-    tmp_path: &Utf8PathBuf,
-) -> Result<Arc<Timeline>> {
-    // Take a lock to prevent concurrent loadings
-    let load_lock = GlobalTimelines::loading_lock().await;
-    let guard = load_lock.lock().await;
-
-    if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) {
-        bail!("timeline already exists, cannot overwrite it")
-    }
-
-    // Move timeline dir to the correct location
-    let timeline_path = get_timeline_dir(conf, &ttid);
-
-    info!(
-        "moving timeline {} from {} to {}",
-        ttid, tmp_path, timeline_path
-    );
-    tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?;
-    // fsync tenant dir creation
-    fsync_async_opt(&conf.workdir, !conf.no_sync).await?;
-    durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;
-
-    let tli = GlobalTimelines::load_timeline(&guard, ttid)
-        .await
-        .context("Failed to load timeline after copy")?;
-
-    info!(
-        "loaded timeline {}, flush_lsn={}",
-        ttid,
-        tli.get_flush_lsn().await
-    );
-
-    Ok(tli)
-}
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 2a9ca85299..3dbf72298f 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -339,7 +339,8 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
                 };
                 let tli =
                     GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
-                        .await?;
+                        .await
+                        .context("create timeline")?;
                 tli.wal_residence_guard().await?
             }
             _ => {
diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs
index 8ae749ded5..8dd873ee77 100644
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -3,7 +3,7 @@
 
 use std::{cmp::max, ops::Deref};
 
-use anyhow::Result;
+use anyhow::{bail, Result};
 use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
@@ -13,7 +13,11 @@ use utils::{
 
 use crate::{
     control_file,
-    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
+    safekeeper::{
+        AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory,
+        UNKNOWN_SERVER_VERSION,
+    },
+    timeline::TimelineError,
     wal_backup_partial::{self},
 };
 
@@ -91,8 +95,24 @@ impl TimelinePersistentState {
         peers: Vec<NodeId>,
         commit_lsn: Lsn,
         local_start_lsn: Lsn,
-    ) -> TimelinePersistentState {
-        TimelinePersistentState {
+    ) -> anyhow::Result<TimelinePersistentState> {
+        if server_info.wal_seg_size == 0 {
+            bail!(TimelineError::UninitializedWalSegSize(*ttid));
+        }
+
+        if server_info.pg_version == UNKNOWN_SERVER_VERSION {
+            bail!(TimelineError::UninitialinzedPgVersion(*ttid));
+        }
+
+        if commit_lsn < local_start_lsn {
+            bail!(
+                "commit_lsn {} is smaller than local_start_lsn {}",
+                commit_lsn,
+                local_start_lsn
+            );
+        }
+
+        Ok(TimelinePersistentState {
             tenant_id: ttid.tenant_id,
             timeline_id: ttid.timeline_id,
             acceptor_state: AcceptorState {
@@ -115,24 +135,23 @@ impl TimelinePersistentState {
             ),
             partial_backup: wal_backup_partial::State::default(),
             eviction_state: EvictionState::Present,
-        }
+        })
     }
 
     #[cfg(test)]
     pub fn empty() -> Self {
-        use crate::safekeeper::UNKNOWN_SERVER_VERSION;
-
         TimelinePersistentState::new(
             &TenantTimelineId::empty(),
             ServerInfo {
-                pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
-                system_id: 0,                       /* Postgres system identifier */
-                wal_seg_size: 0,
+                pg_version: 17, /* Postgres server version */
+                system_id: 0,   /* Postgres system identifier */
+                wal_seg_size: 16 * 1024 * 1024,
             },
             vec![],
             Lsn::INVALID,
             Lsn::INVALID,
         )
+        .unwrap()
     }
 }
 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 41b9490088..dd4d161226 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -27,11 +27,11 @@ use utils::{
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 
+use crate::control_file;
 use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{
-    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
-    INVALID_TERM,
+    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn,
 };
 use crate::send_wal::WalSenders;
 use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState};
@@ -40,7 +40,6 @@ use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self, remote_timeline_path};
 use crate::wal_backup_partial::PartialRemoteSegment;
-use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
 use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
@@ -326,44 +325,6 @@ pub struct SharedState {
 }
 
 impl SharedState {
-    /// Initialize fresh timeline state without persisting anything to disk.
-    fn create_new(
-        conf: &SafeKeeperConf,
-        ttid: &TenantTimelineId,
-        state: TimelinePersistentState,
-    ) -> Result<Self> {
-        if state.server.wal_seg_size == 0 {
-            bail!(TimelineError::UninitializedWalSegSize(*ttid));
-        }
-
-        if state.server.pg_version == UNKNOWN_SERVER_VERSION {
-            bail!(TimelineError::UninitialinzedPgVersion(*ttid));
-        }
-
-        if state.commit_lsn < state.local_start_lsn {
-            bail!(
-                "commit_lsn {} is higher than local_start_lsn {}",
-                state.commit_lsn,
-                state.local_start_lsn
-            );
-        }
-
-        // We don't want to write anything to disk, because we may have existing timeline there.
-        // These functions should not change anything on disk.
-        let timeline_dir = get_timeline_dir(conf, ttid);
-        let control_store =
-            control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?;
-        let wal_store =
-            wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
-        let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?;
-
-        Ok(Self {
-            sk: StateSK::Loaded(sk),
-            peers_info: PeersInfo(vec![]),
-            wal_removal_on_hold: false,
-        })
-    }
-
     /// Restore SharedState from control file. If file doesn't exist, bails out.
     fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
         let timeline_dir = get_timeline_dir(conf, ttid);
@@ -450,6 +411,8 @@ pub enum TimelineError {
     Cancelled(TenantTimelineId),
     #[error("Timeline {0} was not found in global map")]
     NotFound(TenantTimelineId),
+    #[error("Timeline {0} creation is in progress")]
+    CreationInProgress(TenantTimelineId),
     #[error("Timeline {0} exists on disk, but wasn't loaded on startup")]
     Invalid(TenantTimelineId),
     #[error("Timeline {0} is already exists")]
@@ -514,7 +477,7 @@ pub struct Timeline {
 
 impl Timeline {
     /// Load existing timeline from disk.
-    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Timeline> {
+    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
         let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
 
         let shared_state = SharedState::restore(conf, &ttid)?;
@@ -528,7 +491,7 @@ impl Timeline {
 
         let walreceivers = WalReceivers::new();
         let remote_path = remote_timeline_path(&ttid)?;
-        Ok(Timeline {
+        Ok(Arc::new(Timeline {
             ttid,
             remote_path,
             commit_lsn_watch_tx,
@@ -547,47 +510,7 @@ impl Timeline {
             wal_backup_active: AtomicBool::new(false),
             last_removed_segno: AtomicU64::new(0),
             mgr_status: AtomicStatus::new(),
-        })
-    }
-
-    /// Create a new timeline, which is not yet persisted to disk.
-    pub fn create_empty(
-        conf: &SafeKeeperConf,
-        ttid: TenantTimelineId,
-        server_info: ServerInfo,
-        commit_lsn: Lsn,
-        local_start_lsn: Lsn,
-    ) -> Result<Timeline> {
-        let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
-        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
-            watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
-        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
-
-        let state =
-            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
-
-        let walreceivers = WalReceivers::new();
-        let remote_path = remote_timeline_path(&ttid)?;
-        Ok(Timeline {
-            ttid,
-            remote_path,
-            commit_lsn_watch_tx,
-            commit_lsn_watch_rx,
-            term_flush_lsn_watch_tx,
-            term_flush_lsn_watch_rx,
-            shared_state_version_tx,
-            shared_state_version_rx,
-            mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
-            walsenders: WalSenders::new(walreceivers.clone()),
-            walreceivers,
-            cancel: CancellationToken::default(),
-            timeline_dir: get_timeline_dir(conf, &ttid),
-            manager_ctl: ManagerCtl::new(),
-            broker_active: AtomicBool::new(false),
-            wal_backup_active: AtomicBool::new(false),
-            last_removed_segno: AtomicU64::new(0),
-            mgr_status: AtomicStatus::new(),
-        })
+        }))
     }
 
     /// Initialize fresh timeline on disk and start background tasks. If init
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 866cde3339..538bb6e5d2 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -5,11 +5,14 @@
 use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
 use crate::rate_limit::RateLimiter;
 use crate::safekeeper::ServerInfo;
+use crate::state::TimelinePersistentState;
 use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
 use crate::timelines_set::TimelinesSet;
-use crate::SafeKeeperConf;
+use crate::wal_storage::Storage;
+use crate::{control_file, wal_storage, SafeKeeperConf};
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
+use camino_tempfile::Utf8TempDir;
 use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -17,12 +20,22 @@ use std::str::FromStr;
 use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
+use tokio::fs;
 use tracing::*;
+use utils::crashsafe::{durable_rename, fsync_async_opt};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 
+// Timeline entry in the global map: either a ready timeline, or a marker that
+// it is being created.
+#[derive(Clone)]
+enum GlobalMapTimeline {
+    CreationInProgress,
+    Timeline(Arc<Timeline>),
+}
+
 struct GlobalTimelinesState {
-    timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
+    timelines: HashMap<TenantTimelineId, GlobalMapTimeline>,
 
     // A tombstone indicates this timeline used to exist has been deleted.  These are used to prevent
     // on-demand timeline creation from recreating deleted timelines.  This is only soft-enforced, as
@@ -31,13 +44,9 @@ struct GlobalTimelinesState {
 
     conf: Option<SafeKeeperConf>,
     broker_active_set: Arc<TimelinesSet>,
-    load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
     global_rate_limiter: RateLimiter,
 }
 
-// Used to prevent concurrent timeline loading.
-pub struct TimelineLoadLock;
-
 impl GlobalTimelinesState {
     /// Get configuration, which must be set once during init.
     fn get_conf(&self) -> &SafeKeeperConf {
@@ -55,22 +64,16 @@ impl GlobalTimelinesState {
         )
     }
 
-    /// Insert timeline into the map. Returns error if timeline with the same id already exists.
-    fn try_insert(&mut self, timeline: Arc<Timeline>) -> Result<()> {
-        let ttid = timeline.ttid;
-        if self.timelines.contains_key(&ttid) {
-            bail!(TimelineError::AlreadyExists(ttid));
-        }
-        self.timelines.insert(ttid, timeline);
-        Ok(())
-    }
-
-    /// Get timeline from the map. Returns error if timeline doesn't exist.
+    /// Get timeline from the map. Returns error if timeline doesn't exist or
+    /// creation is in progress.
     fn get(&self, ttid: &TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
-        self.timelines
-            .get(ttid)
-            .cloned()
-            .ok_or(TimelineError::NotFound(*ttid))
+        match self.timelines.get(ttid).cloned() {
+            Some(GlobalMapTimeline::Timeline(tli)) => Ok(tli),
+            Some(GlobalMapTimeline::CreationInProgress) => {
+                Err(TimelineError::CreationInProgress(*ttid))
+            }
+            None => Err(TimelineError::NotFound(*ttid)),
+        }
     }
 
     fn delete(&mut self, ttid: TenantTimelineId) {
@@ -85,7 +88,6 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
         tombstones: HashMap::new(),
         conf: None,
         broker_active_set: Arc::new(TimelinesSet::default()),
-        load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
         global_rate_limiter: RateLimiter::new(1, 1),
     })
 });
@@ -141,11 +143,10 @@ impl GlobalTimelines {
     /// Loads all timelines for the given tenant to memory. Returns fs::read_dir
     /// errors if any.
     ///
-    /// It is async for update_status_notify sake. Since TIMELINES_STATE lock is
-    /// sync and there is no important reason to make it async (it is always
-    /// held for a short while) we just lock and unlock it for each timeline --
-    /// this function is called during init when nothing else is running, so
-    /// this is fine.
+    /// It is async, but TIMELINES_STATE lock is sync and there is no important
+    /// reason to make it async (it is always held for a short while), so we
+    /// just lock and unlock it for each timeline -- this function is called
+    /// during init when nothing else is running, so this is fine.
     async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
         let (conf, broker_active_set, partial_backup_rate_limiter) = {
             let state = TIMELINES_STATE.lock().unwrap();
@@ -163,14 +164,13 @@ impl GlobalTimelines {
                     {
                         let ttid = TenantTimelineId::new(tenant_id, timeline_id);
                         match Timeline::load_timeline(&conf, ttid) {
-                            Ok(timeline) => {
-                                let tli = Arc::new(timeline);
+                            Ok(tli) => {
                                 let mut shared_state = tli.write_shared_state().await;
                                 TIMELINES_STATE
                                     .lock()
                                     .unwrap()
                                     .timelines
-                                    .insert(ttid, tli.clone());
+                                    .insert(ttid, GlobalMapTimeline::Timeline(tli.clone()));
                                 tli.bootstrap(
                                     &mut shared_state,
                                     &conf,
@@ -199,51 +199,6 @@ impl GlobalTimelines {
         Ok(())
     }
 
-    /// Take a lock for timeline loading.
-    pub async fn loading_lock() -> Arc<tokio::sync::Mutex<TimelineLoadLock>> {
-        TIMELINES_STATE.lock().unwrap().load_lock.clone()
-    }
-
-    /// Load timeline from disk to the memory.
-    pub async fn load_timeline<'a>(
-        _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
-        ttid: TenantTimelineId,
-    ) -> Result<Arc<Timeline>> {
-        let (conf, broker_active_set, partial_backup_rate_limiter) =
-            TIMELINES_STATE.lock().unwrap().get_dependencies();
-
-        match Timeline::load_timeline(&conf, ttid) {
-            Ok(timeline) => {
-                let tli = Arc::new(timeline);
-                let mut shared_state = tli.write_shared_state().await;
-
-                // TODO: prevent concurrent timeline creation/loading
-                {
-                    let mut state = TIMELINES_STATE.lock().unwrap();
-
-                    // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`).  We trust
-                    // that the human doing this manual intervention knows what they are doing, and remove its tombstone.
-                    if state.tombstones.remove(&ttid).is_some() {
-                        warn!("Un-deleted timeline {ttid}");
-                    }
-
-                    state.timelines.insert(ttid, tli.clone());
-                }
-
-                tli.bootstrap(
-                    &mut shared_state,
-                    &conf,
-                    broker_active_set,
-                    partial_backup_rate_limiter,
-                );
-                drop(shared_state);
-                Ok(tli)
-            }
-            // If we can't load a timeline, it's bad. Caller will figure it out.
-            Err(e) => bail!("failed to load timeline {}, reason: {:?}", ttid, e),
-        }
-    }
-
     /// Get the number of timelines in the map.
     pub fn timelines_count() -> usize {
         TIMELINES_STATE.lock().unwrap().timelines.len()
@@ -266,7 +221,7 @@ impl GlobalTimelines {
         commit_lsn: Lsn,
         local_start_lsn: Lsn,
     ) -> Result<Arc<Timeline>> {
-        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+        let (conf, _, _) = {
             let state = TIMELINES_STATE.lock().unwrap();
             if let Ok(timeline) = state.get(&ttid) {
                 // Timeline already exists, return it.
@@ -282,55 +237,146 @@ impl GlobalTimelines {
 
         info!("creating new timeline {}", ttid);
 
-        let timeline = Arc::new(Timeline::create_empty(
-            &conf,
-            ttid,
-            server_info,
-            commit_lsn,
-            local_start_lsn,
-        )?);
+        // Do on disk initialization in tmp dir.
+        let (_tmp_dir, tmp_dir_path) = create_temp_timeline_dir(&conf, ttid).await?;
 
-        // Take a lock and finish the initialization holding this mutex. No other threads
-        // can interfere with creation after we will insert timeline into the map.
-        {
-            let mut shared_state = timeline.write_shared_state().await;
+        // TODO: currently we create only cfile. It would be reasonable to
+        // immediately initialize first WAL segment as well.
+        let state =
+            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
+        control_file::FileStorage::create_new(tmp_dir_path.clone(), &conf, state).await?;
+        let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
+        Ok(timeline)
+    }
 
-            // We can get a race condition here in case of concurrent create calls, but only
-            // in theory. create() will return valid timeline on the next try.
-            TIMELINES_STATE
-                .lock()
-                .unwrap()
-                .try_insert(timeline.clone())?;
+    /// Move a timeline from a temp directory to the main storage, and load it
+    /// into the global map. Creating a timeline this way ensures atomicity:
+    /// rename is atomic, so the move of the whole datadir either succeeds or
+    /// it doesn't, and a corrupted data dir shouldn't be possible.
+    ///
+    /// We'd like to avoid holding the map lock while doing IO, so it's a 3 step
+    /// process:
+    /// 1) check in the global map that the timeline doesn't exist and mark that
+    ///    we're creating it;
+    /// 2) move the directory and load the timeline;
+    /// 3) take the lock again and insert the timeline into the global map.
+    pub async fn load_temp_timeline(
+        ttid: TenantTimelineId,
+        tmp_path: &Utf8PathBuf,
+        check_tombstone: bool,
+    ) -> Result<Arc<Timeline>> {
+        // Check for existence and mark that we're creating it.
+        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+            let mut state = TIMELINES_STATE.lock().unwrap();
+            match state.timelines.get(&ttid) {
+                Some(GlobalMapTimeline::CreationInProgress) => {
+                    bail!(TimelineError::CreationInProgress(ttid));
+                }
+                Some(GlobalMapTimeline::Timeline(_)) => {
+                    bail!(TimelineError::AlreadyExists(ttid));
+                }
+                _ => {}
+            }
+            if check_tombstone {
+                if state.tombstones.contains_key(&ttid) {
+                    anyhow::bail!("timeline {ttid} is deleted, refusing to recreate");
+                }
+            } else {
+                // We may have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`).  We trust
+                // that the human doing this manual intervention knows what they are doing, and remove its tombstone.
+                if state.tombstones.remove(&ttid).is_some() {
+                    warn!("un-deleted timeline {ttid}");
+                }
+            }
+            state
+                .timelines
+                .insert(ttid, GlobalMapTimeline::CreationInProgress);
+            state.get_dependencies()
+        };
 
-            // Write the new timeline to the disk and start background workers.
-            // Bootstrap is transactional, so if it fails, the timeline will be deleted,
-            // and the state on disk should remain unchanged.
-            if let Err(e) = timeline
-                .init_new(
-                    &mut shared_state,
+        // Do the actual move and reflect the result in the map.
+        match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await {
+            Ok(timeline) => {
+                let mut timeline_shared_state = timeline.write_shared_state().await;
+                let mut state = TIMELINES_STATE.lock().unwrap();
+                assert!(matches!(
+                    state.timelines.get(&ttid),
+                    Some(GlobalMapTimeline::CreationInProgress)
+                ));
+
+                state
+                    .timelines
+                    .insert(ttid, GlobalMapTimeline::Timeline(timeline.clone()));
+                drop(state);
+                timeline.bootstrap(
+                    &mut timeline_shared_state,
                     &conf,
                     broker_active_set,
                     partial_backup_rate_limiter,
-                )
-                .await
-            {
-                // Note: the most likely reason for init failure is that the timeline
-                // directory already exists on disk. This happens when timeline is corrupted
-                // and wasn't loaded from disk on startup because of that. We want to preserve
-                // the timeline directory in this case, for further inspection.
-
-                // TODO: this is an unusual error, perhaps we should send it to sentry
-                // TODO: compute will try to create timeline every second, we should add backoff
-                error!("failed to init new timeline {}: {}", ttid, e);
-
-                // Timeline failed to init, it cannot be used. Remove it from the map.
-                TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid);
-                return Err(e);
+                );
+                drop(timeline_shared_state);
+                Ok(timeline)
+            }
+            Err(e) => {
+                // Init failed, remove the marker from the map
+                let mut state = TIMELINES_STATE.lock().unwrap();
+                assert!(matches!(
+                    state.timelines.get(&ttid),
+                    Some(GlobalMapTimeline::CreationInProgress)
+                ));
+                state.timelines.remove(&ttid);
+                Err(e)
             }
-            // We are done with bootstrap, release the lock, return the timeline.
-            // {} block forces release before .await
         }
-        Ok(timeline)
+    }
+
+    /// Main part of load_temp_timeline: do the move and load.
+    async fn install_temp_timeline(
+        ttid: TenantTimelineId,
+        tmp_path: &Utf8PathBuf,
+        conf: &SafeKeeperConf,
+    ) -> Result<Arc<Timeline>> {
+        let tenant_path = get_tenant_dir(conf, &ttid.tenant_id);
+        let timeline_path = get_timeline_dir(conf, &ttid);
+
+        // We must have already checked that the timeline doesn't exist in the
+        // map, but there might be an existing datadir: a corrupted timeline is
+        // not loaded. We don't want to overwrite such a dir, so check for its
+        // existence.
+        match fs::metadata(&timeline_path).await {
+            Ok(_) => {
+                // Timeline directory exists on disk, we should leave state unchanged
+                // and return error.
+                bail!(TimelineError::Invalid(ttid));
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => {
+                return Err(e.into());
+            }
+        }
+
+        info!(
+            "moving timeline {} from {} to {}",
+            ttid, tmp_path, timeline_path
+        );
+
+        // Now it is safe to move the timeline directory to the correct
+        // location. First, create tenant directory. Ignore error if it already
+        // exists.
+        if let Err(e) = tokio::fs::create_dir(&tenant_path).await {
+            if e.kind() != std::io::ErrorKind::AlreadyExists {
+                return Err(e.into());
+            }
+        }
+        // fsync the tenant directory
+        fsync_async_opt(&tenant_path, !conf.no_sync).await?;
+        // and the tenant directory's creation (by fsyncing the workdir)
+        fsync_async_opt(&conf.workdir, !conf.no_sync).await?;
+
+        // Do the move.
+        durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;
+
+        Timeline::load_timeline(conf, ttid)
     }
 
     /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
@@ -358,8 +404,16 @@ impl GlobalTimelines {
         global_lock
             .timelines
             .values()
-            .filter(|t| !t.is_cancelled())
-            .cloned()
+            .filter_map(|t| match t {
+                GlobalMapTimeline::Timeline(t) => {
+                    if t.is_cancelled() {
+                        None
+                    } else {
+                        Some(t.clone())
+                    }
+                }
+                _ => None,
+            })
             .collect()
     }
 
@@ -370,8 +424,11 @@ impl GlobalTimelines {
         global_lock
             .timelines
             .values()
+            .filter_map(|t| match t {
+                GlobalMapTimeline::Timeline(t) => Some(t.clone()),
+                _ => None,
+            })
             .filter(|t| t.ttid.tenant_id == tenant_id)
-            .cloned()
             .collect()
     }
 
@@ -504,3 +561,45 @@ fn delete_dir(path: Utf8PathBuf) -> Result<bool> {
         Err(e) => Err(e.into()),
     }
 }
+
+/// Create temp directory for a new timeline. It needs to be located on the same
+/// filesystem as the rest of the timelines. It will be automatically deleted when
+/// Utf8TempDir goes out of scope.
+pub async fn create_temp_timeline_dir(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+) -> Result<(Utf8TempDir, Utf8PathBuf)> {
+    let temp_base = conf.workdir.join("tmp");
+
+    tokio::fs::create_dir_all(&temp_base).await?;
+
+    let tli_dir = camino_tempfile::Builder::new()
+        .suffix("_temptli")
+        .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
+        .tempdir_in(temp_base)?;
+
+    let tli_dir_path = tli_dir.path().to_path_buf();
+
+    Ok((tli_dir, tli_dir_path))
+}
+
+/// Do basic validation of a temp timeline, before moving it to the global map.
+pub async fn validate_temp_timeline(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    path: &Utf8PathBuf,
+) -> Result<(Lsn, Lsn)> {
+    let control_path = path.join("safekeeper.control");
+
+    let control_store = control_file::FileStorage::load_control_file(control_path)?;
+    if control_store.server.wal_seg_size == 0 {
+        bail!("wal_seg_size is not set");
+    }
+
+    let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
+
+    let commit_lsn = control_store.commit_lsn;
+    let flush_lsn = wal_store.flush_lsn();
+
+    Ok((commit_lsn, flush_lsn))
+}
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 6e7da94973..61d7825ae6 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -186,8 +186,14 @@ impl PhysicalStorage {
             "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}",
             ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn,
         );
-        if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn {
-            warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id);
+        if flush_lsn < state.commit_lsn {
+            bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn  {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn);
+        }
+        if flush_lsn < state.peer_horizon_lsn {
+            warn!(
+                "timeline {}: flush_lsn {} is less than cfile peer_horizon_lsn {}",
+                ttid.timeline_id, flush_lsn, state.peer_horizon_lsn
+            );
         }
 
         Ok(PhysicalStorage {
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 047b4be8fa..12aa025771 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -59,7 +59,7 @@ impl GlobalMap {
 
             if state.commit_lsn < state.local_start_lsn {
                 bail!(
-                    "commit_lsn {} is higher than local_start_lsn {}",
+                    "commit_lsn {} is smaller than local_start_lsn {}",
                     state.commit_lsn,
                     state.local_start_lsn
                 );
@@ -96,23 +96,7 @@ impl GlobalMap {
         let local_start_lsn = Lsn::INVALID;
 
         let state =
-            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
-
-        if state.server.wal_seg_size == 0 {
-            bail!(TimelineError::UninitializedWalSegSize(ttid));
-        }
-
-        if state.server.pg_version == UNKNOWN_SERVER_VERSION {
-            bail!(TimelineError::UninitialinzedPgVersion(ttid));
-        }
-
-        if state.commit_lsn < state.local_start_lsn {
-            bail!(
-                "commit_lsn {} is higher than local_start_lsn {}",
-                state.commit_lsn,
-                state.local_start_lsn
-            );
-        }
+            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
 
         let disk_timeline = self.disk.put_state(&ttid, state);
         let control_store = DiskStateStorage::new(disk_timeline.clone());

From b7fa93f6b7ab5d562e0985eb06bdac8ba12ad892 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 22 Oct 2024 09:14:29 -0600
Subject: [PATCH 37/48] Use make's builtin RM variable

At least as far as removing individual files goes, `$(RM)` (which defaults
to `rm -f`) is the best pattern. I can't say the same for removing
directories, but I went ahead and changed those to `$(RM) -r` anyway.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 Makefile           | 4 ++--
 compute/Makefile   | 2 +-
 pgxn/neon/Makefile | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 33cfda2661..8e3b755112 100644
--- a/Makefile
+++ b/Makefile
@@ -297,7 +297,7 @@ clean: postgres-clean neon-pg-clean-ext
 # This removes everything
 .PHONY: distclean
 distclean:
-	rm -rf $(POSTGRES_INSTALL_DIR)
+	$(RM) -r $(POSTGRES_INSTALL_DIR)
 	$(CARGO_CMD_PREFIX) cargo clean
 
 .PHONY: fmt
@@ -329,7 +329,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
 		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
-	rm -f pg*.BAK
+	$(RM) pg*.BAK
 
 # Indent pxgn/neon.
 .PHONY: neon-pgindent
diff --git a/compute/Makefile b/compute/Makefile
index 08e3c7a68b..645880ce70 100644
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -34,7 +34,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)
 
 .PHONY: clean
 clean:
-	rm -f \
+	$(RM) \
 		etc/neon_collector.yml \
 		etc/neon_collector_autoscaling.yml \
 		etc/sql_exporter.yml \
diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index f1229b2d73..1503b856f7 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -54,7 +54,7 @@ walproposer-lib: libwalproposer.a;
 
 .PHONY: libwalproposer.a
 libwalproposer.a: $(WALPROP_OBJS)
-	rm -f $@
+	$(RM) $@
 	$(AR) $(AROPT) $@ $^
 
 # needs vars:

From 8dca188974530b3c0c2160b22930615141e0236b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 22 Oct 2024 19:43:02 +0100
Subject: [PATCH 38/48] storage controller: add metrics for tenant shard, node
 count (#9475)

## Problem

Previously, figuring out how many tenant shards were managed by a
storage controller was typically done by peeking at the database or
calling into the API. A metric makes it easier to monitor, as
unexpectedly increasing shard counts can be indicative of problems
elsewhere in the system.

## Summary of changes

- Add metrics `storage_controller_pageserver_nodes` (updated on node
CRUD operations from Service) and `storage_controller_tenant_shards`
(updated RAII-style from TenantShard)
---
 storage_controller/src/metrics.rs             |  6 +++++
 storage_controller/src/service.rs             | 22 ++++++++++++++++---
 storage_controller/src/tenant_shard.rs        | 19 ++++++++++++++++
 .../regress/test_storage_controller.py        |  9 ++++++++
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 5989aeba91..a1f7bc2457 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -37,6 +37,12 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Count of how many times we spawn a reconcile task
     pub(crate) storage_controller_reconcile_spawn: measured::Counter,
 
+    /// Size of the in-memory map of tenant shards
+    pub(crate) storage_controller_tenant_shards: measured::Gauge,
+
+    /// Size of the in-memory map of pageserver_nodes
+    pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
+
     /// Reconciler tasks completed, broken down by success/failure/cancelled
     pub(crate) storage_controller_reconcile_complete:
         measured::CounterVec<ReconcileCompleteLabelGroupSet>,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 01aa8f1dab..2cde1d6a3d 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -934,7 +934,6 @@ impl Service {
         self.startup_complete.clone().wait().await;
 
         const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
-
         let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
         while !self.reconcilers_cancel.is_cancelled() {
             tokio::select! {
@@ -1272,6 +1271,10 @@ impl Service {
             .collect::<Vec<_>>();
         let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.get_id(), n)).collect();
         tracing::info!("Loaded {} nodes from database.", nodes.len());
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(nodes.len() as i64);
 
         tracing::info!("Loading shards from database...");
         let mut tenant_shard_persistence = persistence.list_tenant_shards().await?;
@@ -4110,9 +4113,9 @@ impl Service {
                     (
                         old_attached,
                         generation,
-                        old_state.policy,
+                        old_state.policy.clone(),
                         old_state.shard,
-                        old_state.config,
+                        old_state.config.clone(),
                     )
                 };
 
@@ -5075,6 +5078,10 @@ impl Service {
         let mut nodes = (*locked.nodes).clone();
         nodes.remove(&node_id);
         locked.nodes = Arc::new(nodes);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(locked.nodes.len() as i64);
 
         locked.scheduler.node_remove(node_id);
 
@@ -5158,6 +5165,10 @@ impl Service {
                     removed_node.set_availability(NodeAvailability::Offline);
                 }
                 *nodes = Arc::new(nodes_mut);
+                metrics::METRICS_REGISTRY
+                    .metrics_group
+                    .storage_controller_pageserver_nodes
+                    .set(nodes.len() as i64);
             }
         }
 
@@ -5346,6 +5357,11 @@ impl Service {
 
         locked.nodes = Arc::new(new_nodes);
 
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_nodes
+            .set(locked.nodes.len() as i64);
+
         tracing::info!(
             "Registered pageserver {}, now have {} pageservers",
             register_req.node_id,
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 8a7ff866e6..e696c72ba7 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -473,6 +473,11 @@ impl TenantShard {
         shard: ShardIdentity,
         policy: PlacementPolicy,
     ) -> Self {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
+
         Self {
             tenant_shard_id,
             policy,
@@ -1384,6 +1389,11 @@ impl TenantShard {
         let tenant_shard_id = tsp.get_tenant_shard_id()?;
         let shard_identity = tsp.get_shard_identity()?;
 
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .inc();
+
         Ok(Self {
             tenant_shard_id,
             shard: shard_identity,
@@ -1512,6 +1522,15 @@ impl TenantShard {
     }
 }
 
+impl Drop for TenantShard {
+    fn drop(&mut self) {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_tenant_shards
+            .dec();
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use std::{cell::RefCell, rc::Rc};
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index a4e293da9e..d4bc4b1a4f 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -107,6 +107,15 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination)
     for tid in tenant_ids:
         env.create_tenant(tid, shard_count=shards_per_tenant)
 
+    # Validate high level metrics
+    assert (
+        env.storage_controller.get_metric_value("storage_controller_tenant_shards")
+        == len(tenant_ids) * shards_per_tenant
+    )
+    assert env.storage_controller.get_metric_value("storage_controller_pageserver_nodes") == len(
+        env.storage_controller.node_list()
+    )
+
     # Repeating a creation should be idempotent (we are just testing it doesn't return an error)
     env.storage_controller.tenant_create(
         tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant

From f36cf3f885e9559434f378b45b2e944440e56058 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 22 Oct 2024 21:58:55 +0200
Subject: [PATCH 39/48] Fix local errors for the tests with the versions mix
 (#9477)

## Problem
If the environment variables `COMPATIBILITY_NEON_BIN` or
`COMPATIBILITY_POSTGRES_DISTRIB_DIR` are not set (this is usual during a
local run), the tests with the versions mix cannot run.
## Summary of changes
If these variables are not set, turn off the version mix.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 test_runner/fixtures/utils.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 7ca6b3dd1c..d12fa59abc 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Any, Callable, TypeVar
 from urllib.parse import urlencode
 
 import allure
+import pytest
 import zstandard
 from psycopg2.extensions import cursor
 from typing_extensions import override
@@ -634,9 +635,27 @@ def allpairs_versions():
     the different versions.
     """
     ids = []
+    argvalues = []
+    compat_not_defined = (
+        os.getenv("COMPATIBILITY_POSTGRES_DISTRIB_DIR") is None
+        or os.getenv("COMPATIBILITY_NEON_BIN") is None
+    )
     for pair in VERSIONS_COMBINATIONS:
         cur_id = []
+        all_new = all(v == "new" for v in pair.values())
         for component in sorted(pair.keys()):
             cur_id.append(pair[component][0])
+        # Adding None if all versions are new, so no need to mix at all
+        # If COMPATIBILITY_NEON_BIN or COMPATIBILITY_POSTGRES_DISTRIB_DIR are not defined,
+        # we will skip all the tests which include the versions mix.
+        argvalues.append(
+            pytest.param(
+                None if all_new else pair,
+                marks=pytest.mark.skipif(
+                    compat_not_defined and not all_new,
+                    reason="COMPATIBILITY_NEON_BIN or COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set",
+                ),
+            )
+        )
         ids.append(f"combination_{''.join(cur_id)}")
-    return {"argnames": "combination", "argvalues": VERSIONS_COMBINATIONS, "ids": ids}
+    return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids}

From fcb55a2aa2a742346e875126a5e0d1cec6663645 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 22 Oct 2024 14:34:26 -0600
Subject: [PATCH 40/48] Fix copy-paste error in checkpoints_timed metric

Importing the wrong metric. Sigh...

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute/etc/sql_exporter/checkpoints_timed.libsonnet | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet
index 0ba0080188..ebe2ddc9f2 100644
--- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet
+++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet
@@ -1,7 +1,7 @@
 local neon = import 'neon.libsonnet';
 
-local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql';
-local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql';
+local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql';
+local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql';
 
 {
   metric_name: 'checkpoints_timed',

From 6f8fcdf9ea71599735192d0f60cce80b0cd42405 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 22 Oct 2024 22:52:30 +0200
Subject: [PATCH 41/48] Timeline offloading persistence (#9444)

Persist timeline offloaded state to S3.

Right now, as of #8907, at each restart of the pageserver, all offloaded
state is lost, so we load the full timeline again. As it starts with an
empty local directory, we might potentially download some files again,
leading to downloads that are ultimately wasteful.

This patch adds support for persisting the offloaded state, allowing us
to never load offloaded timelines in the first place. The persistence
feature is facilitated via a new file in S3 that is tenant-global, which
contains a list of all offloaded timelines. It is updated each time we
offload or unoffload a timeline, and otherwise never touched.

This choice means that tenants where no offloading is happening will not
immediately get a manifest, keeping the change very minimal at the
start.

We leave generation support for future work. It is important to support
generations, as in the worst case, the manifest might be overwritten by
an older generation after a timeline has been unoffloaded (and
unarchived), so the next pageserver process instantiation might wrongly
believe that some timeline is still offloaded even though it should be
active.

Part of #9386, #8088
---
 libs/pageserver_api/src/models.rs             |   2 +
 pageserver/src/http/routes.rs                 |   2 +
 pageserver/src/tenant.rs                      | 280 +++++++++++++++---
 .../src/tenant/remote_timeline_client.rs      |  41 ++-
 .../tenant/remote_timeline_client/download.rs |  53 +++-
 .../tenant/remote_timeline_client/manifest.rs |  53 ++++
 .../tenant/remote_timeline_client/upload.rs   |  33 +++
 pageserver/src/tenant/timeline.rs             |   6 +-
 pageserver/src/tenant/timeline/delete.rs      |  69 +++--
 pageserver/src/tenant/timeline/offload.rs     |  53 +++-
 test_runner/regress/test_timeline_archive.py  | 125 +++++++-
 11 files changed, 637 insertions(+), 80 deletions(-)
 create mode 100644 pageserver/src/tenant/remote_timeline_client/manifest.rs

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index e08bf40801..d0ee4b64d1 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -699,6 +699,8 @@ pub struct OffloadedTimelineInfo {
     pub ancestor_timeline_id: Option<TimelineId>,
     /// Whether to retain the branch lsn at the ancestor or not
     pub ancestor_retain_lsn: Option<Lsn>,
+    /// The time point when the timeline was archived
+    pub archived_at: chrono::DateTime<chrono::Utc>,
 }
 
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a254f1683d..2490bf5f20 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -486,6 +486,7 @@ fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> Offloade
         timeline_id,
         ancestor_retain_lsn,
         ancestor_timeline_id,
+        archived_at,
         ..
     } = offloaded.as_ref();
     OffloadedTimelineInfo {
@@ -493,6 +494,7 @@ fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> Offloade
         timeline_id,
         ancestor_retain_lsn,
         ancestor_timeline_id,
+        archived_at: archived_at.and_utc(),
     }
 }
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 41d21ef041..7a3305797c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -16,6 +16,7 @@ use anyhow::{bail, Context};
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
+use chrono::NaiveDateTime;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
@@ -31,6 +32,10 @@ use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
+use remote_timeline_client::manifest::{
+    OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION,
+};
+use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
 use std::fmt;
 use std::future::Future;
@@ -65,13 +70,14 @@ use self::config::TenantConf;
 use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
-use self::remote_timeline_client::upload::upload_index_part;
+use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest};
 use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError};
 use self::timeline::uninit::TimelineCreateGuard;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::GcCutoffs;
+use self::timeline::TimelineDeleteProgress;
 use self::timeline::TimelineResources;
 use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
@@ -240,6 +246,7 @@ struct TimelinePreload {
 }
 
 pub(crate) struct TenantPreload {
+    tenant_manifest: TenantManifest,
     timelines: HashMap<TimelineId, TimelinePreload>,
 }
 
@@ -488,6 +495,12 @@ impl WalRedoManager {
     }
 }
 
+/// A very lightweight memory representation of an offloaded timeline.
+///
+/// We need to store the list of offloaded timelines so that we can perform operations on them,
+/// like unoffloading them, or (at a later date), decide to perform flattening.
+/// This type has a much smaller memory impact than [`Timeline`], and thus we can store many
+/// more offloaded timelines than we could store regular (non-offloaded) ones.
 pub struct OffloadedTimeline {
     pub tenant_shard_id: TenantShardId,
     pub timeline_id: TimelineId,
@@ -495,27 +508,78 @@ pub struct OffloadedTimeline {
     /// Whether to retain the branch lsn at the ancestor or not
     pub ancestor_retain_lsn: Option<Lsn>,
 
-    // TODO: once we persist offloaded state, make this lazily constructed
-    pub remote_client: Arc<RemoteTimelineClient>,
+    /// When the timeline was archived.
+    ///
+    /// Present for future flattening deliberations.
+    pub archived_at: NaiveDateTime,
+
+    /// Lazily constructed remote client for the timeline
+    ///
+    /// If we offload a timeline, we keep around the remote client
+    /// for the duration of the process. If we find it through the
+    /// manifest, we don't construct it until it's needed (deletion).
+    pub remote_client: Option<Arc<RemoteTimelineClient>>,
 
     /// Prevent two tasks from deleting the timeline at the same time. If held, the
     /// timeline is being deleted. If 'true', the timeline has already been deleted.
-    pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
+    pub delete_progress: TimelineDeleteProgress,
 }
 
 impl OffloadedTimeline {
-    fn from_timeline(timeline: &Timeline) -> Self {
+    /// Obtains an offloaded timeline from a given timeline object.
+    ///
+    /// Returns an error if the `archived_at` flag couldn't be obtained, i.e.
+    /// the timeline is not in a stopped state.
+    /// Panics if the timeline is not archived.
+    fn from_timeline(timeline: &Timeline) -> Result<Self, UploadQueueNotReadyError> {
         let ancestor_retain_lsn = timeline
             .get_ancestor_timeline_id()
             .map(|_timeline_id| timeline.get_ancestor_lsn());
-        Self {
+        let archived_at = timeline
+            .remote_client
+            .archived_at_stopped_queue()?
+            .expect("must be called on an archived timeline");
+        Ok(Self {
             tenant_shard_id: timeline.tenant_shard_id,
             timeline_id: timeline.timeline_id,
             ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
             ancestor_retain_lsn,
+            archived_at,
 
-            remote_client: timeline.remote_client.clone(),
+            remote_client: Some(timeline.remote_client.clone()),
             delete_progress: timeline.delete_progress.clone(),
+        })
+    }
+    fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
+        let OffloadedTimelineManifest {
+            timeline_id,
+            ancestor_timeline_id,
+            ancestor_retain_lsn,
+            archived_at,
+        } = *manifest;
+        Self {
+            tenant_shard_id,
+            timeline_id,
+            ancestor_timeline_id,
+            ancestor_retain_lsn,
+            archived_at,
+            remote_client: None,
+            delete_progress: TimelineDeleteProgress::default(),
+        }
+    }
+    fn manifest(&self) -> OffloadedTimelineManifest {
+        let Self {
+            timeline_id,
+            ancestor_timeline_id,
+            ancestor_retain_lsn,
+            archived_at,
+            ..
+        } = self;
+        OffloadedTimelineManifest {
+            timeline_id: *timeline_id,
+            ancestor_timeline_id: *ancestor_timeline_id,
+            ancestor_retain_lsn: *ancestor_retain_lsn,
+            archived_at: *archived_at,
         }
     }
 }
@@ -551,10 +615,19 @@ impl TimelineOrOffloaded {
             TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
         }
     }
-    pub fn remote_client(&self) -> &Arc<RemoteTimelineClient> {
+    pub fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
         match self {
-            TimelineOrOffloaded::Timeline(timeline) => &timeline.remote_client,
-            TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.remote_client,
+            TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
+            TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
+                Some(remote_client) => remote_client,
+                None => {
+                    let remote_client = tenant.build_timeline_client(
+                        offloaded.timeline_id,
+                        tenant.remote_storage.clone(),
+                    );
+                    Arc::new(remote_client)
+                }
+            },
         }
     }
 }
@@ -1131,14 +1204,35 @@ impl Tenant {
             cancel.clone(),
         )
         .await?;
+        let (offloaded_add, tenant_manifest) =
+            match remote_timeline_client::do_download_tenant_manifest(
+                remote_storage,
+                &self.tenant_shard_id,
+                &cancel,
+            )
+            .await
+            {
+                Ok((tenant_manifest, _generation)) => (
+                    format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
+                    tenant_manifest,
+                ),
+                Err(DownloadError::NotFound) => {
+                    ("no manifest".to_string(), TenantManifest::empty())
+                }
+                Err(e) => Err(e)?,
+            };
 
-        info!("found {} timelines", remote_timeline_ids.len(),);
+        info!(
+            "found {} timelines, and {offloaded_add}",
+            remote_timeline_ids.len()
+        );
 
         for k in other_keys {
             warn!("Unexpected non timeline key {k}");
         }
 
         Ok(TenantPreload {
+            tenant_manifest,
             timelines: self
                 .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
                 .await?,
@@ -1163,12 +1257,26 @@ impl Tenant {
             anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
         };
 
+        let mut offloaded_timeline_ids = HashSet::new();
+        let mut offloaded_timelines_list = Vec::new();
+        for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() {
+            let timeline_id = timeline_manifest.timeline_id;
+            let offloaded_timeline =
+                OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
+            offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
+            offloaded_timeline_ids.insert(timeline_id);
+        }
+
         let mut timelines_to_resume_deletions = vec![];
 
         let mut remote_index_and_client = HashMap::new();
         let mut timeline_ancestors = HashMap::new();
         let mut existent_timelines = HashSet::new();
         for (timeline_id, preload) in preload.timelines {
+            if offloaded_timeline_ids.remove(&timeline_id) {
+                // The timeline is offloaded, skip loading it.
+                continue;
+            }
             let index_part = match preload.index_part {
                 Ok(i) => {
                     debug!("remote index part exists for timeline {timeline_id}");
@@ -1272,6 +1380,43 @@ impl Tenant {
             .context("resume_deletion")
             .map_err(LoadLocalTimelineError::ResumeDeletion)?;
         }
+        // Complete deletions for offloaded timeline id's.
+        offloaded_timelines_list
+            .retain(|(offloaded_id, _offloaded)| {
+                // At this point, offloaded_timeline_ids holds only those offloaded timelines
+                // that have no prefix in S3, i.e. that do not exist remotely.
+                // Ultimately, the existence of a timeline is determined by the existence of an index-part.json in remote storage.
+                // Dangling references to it in other locations (like the manifest) need to be cleaned up.
+                let delete = offloaded_timeline_ids.contains(offloaded_id);
+                if delete {
+                    tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
+                }
+                !delete
+        });
+        if !offloaded_timelines_list.is_empty() {
+            tracing::info!(
+                "Tenant has {} offloaded timelines",
+                offloaded_timelines_list.len()
+            );
+        }
+        {
+            let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
+            offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
+        }
+        if !offloaded_timeline_ids.is_empty() {
+            let manifest = self.tenant_manifest();
+            // TODO: generation support
+            let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
+            upload_tenant_manifest(
+                &self.remote_storage,
+                &self.tenant_shard_id,
+                generation,
+                &manifest,
+                &self.cancel,
+            )
+            .await
+            .map_err(TimelineArchivalError::Other)?;
+        }
 
         // The local filesystem contents are a cache of what's in the remote IndexPart;
         // IndexPart is the source of truth.
@@ -1443,20 +1588,28 @@ impl Tenant {
         Ok(timeline_preloads)
     }
 
-    fn load_timeline_metadata(
-        self: &Arc<Tenant>,
+    fn build_timeline_client(
+        &self,
         timeline_id: TimelineId,
         remote_storage: GenericRemoteStorage,
-        cancel: CancellationToken,
-    ) -> impl Future<Output = TimelinePreload> {
-        let client = RemoteTimelineClient::new(
+    ) -> RemoteTimelineClient {
+        RemoteTimelineClient::new(
             remote_storage.clone(),
             self.deletion_queue_client.clone(),
             self.conf,
             self.tenant_shard_id,
             timeline_id,
             self.generation,
-        );
+        )
+    }
+
+    fn load_timeline_metadata(
+        self: &Arc<Tenant>,
+        timeline_id: TimelineId,
+        remote_storage: GenericRemoteStorage,
+        cancel: CancellationToken,
+    ) -> impl Future<Output = TimelinePreload> {
+        let client = self.build_timeline_client(timeline_id, remote_storage);
         async move {
             debug_assert_current_span_has_tenant_and_timeline_id();
             debug!("starting index part download");
@@ -1547,7 +1700,7 @@ impl Tenant {
         info!("unoffloading timeline");
         let cancel = self.cancel.clone();
         let timeline_preload = self
-            .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel)
+            .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
             .await;
 
         let index_part = match timeline_preload.index_part {
@@ -1592,17 +1745,37 @@ impl Tenant {
             )
         })
         .map_err(TimelineArchivalError::Other)?;
-        let timelines = self.timelines.lock().unwrap();
-        let Some(timeline) = timelines.get(&timeline_id) else {
-            warn!("timeline not available directly after attach");
-            return Err(TimelineArchivalError::Other(anyhow::anyhow!(
-                "timeline not available directly after attach"
-            )));
+
+        let timeline = {
+            let timelines = self.timelines.lock().unwrap();
+            let Some(timeline) = timelines.get(&timeline_id) else {
+                warn!("timeline not available directly after attach");
+                // This is not a panic because no locks are held between `load_remote_timeline`
+                // which puts the timeline into timelines, and our look into the timeline map.
+                return Err(TimelineArchivalError::Other(anyhow::anyhow!(
+                    "timeline not available directly after attach"
+                )));
+            };
+            let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
+            if offloaded_timelines.remove(&timeline_id).is_none() {
+                warn!("timeline already removed from offloaded timelines");
+            }
+            Arc::clone(timeline)
         };
-        let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
-        if offloaded_timelines.remove(&timeline_id).is_none() {
-            warn!("timeline already removed from offloaded timelines");
-        }
+
+        // Upload new list of offloaded timelines to S3
+        let manifest = self.tenant_manifest();
+        // TODO: generation support
+        let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
+        upload_tenant_manifest(
+            &self.remote_storage,
+            &self.tenant_shard_id,
+            generation,
+            &manifest,
+            &cancel,
+        )
+        .await
+        .map_err(TimelineArchivalError::Other)?;
 
         // Activate the timeline (if it makes sense)
         if !(timeline.is_broken() || timeline.is_stopping()) {
@@ -1616,7 +1789,7 @@ impl Tenant {
         }
 
         info!("timeline unoffloading complete");
-        Ok(Arc::clone(timeline))
+        Ok(timeline)
     }
 
     pub(crate) async fn apply_timeline_archival_config(
@@ -2793,6 +2966,26 @@ impl Tenant {
             }
         }
 
+        // TODO: also copy index files of offloaded timelines
+
+        let tenant_manifest = self.tenant_manifest();
+        // TODO: generation support
+        let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
+        for child_shard in child_shards {
+            tracing::info!(
+                "Uploading tenant manifest for child {}",
+                child_shard.to_index()
+            );
+            upload_tenant_manifest(
+                &self.remote_storage,
+                child_shard,
+                generation,
+                &tenant_manifest,
+                &self.cancel,
+            )
+            .await?;
+        }
+
         Ok(())
     }
 
@@ -2970,6 +3163,22 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
     }
 
+    pub(crate) fn tenant_manifest(&self) -> TenantManifest {
+        let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
+
+        let mut timeline_manifests = timelines_offloaded
+            .iter()
+            .map(|(_timeline_id, offloaded)| offloaded.manifest())
+            .collect::<Vec<_>>();
+        // Sort the manifests so that our output is deterministic
+        timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id);
+
+        TenantManifest {
+            version: LATEST_TENANT_MANIFEST_VERSION,
+            offloaded_timelines: timeline_manifests,
+        }
+    }
+
     pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
         // Use read-copy-update in order to avoid overwriting the location config
         // state if this races with [`Tenant::set_new_location_config`]. Note that
@@ -3962,18 +4171,21 @@ impl Tenant {
         Ok(timeline)
     }
 
-    /// Call this before constructing a timeline, to build its required structures
-    fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
-        let remote_client = RemoteTimelineClient::new(
+    fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient {
+        RemoteTimelineClient::new(
             self.remote_storage.clone(),
             self.deletion_queue_client.clone(),
             self.conf,
             self.tenant_shard_id,
             timeline_id,
             self.generation,
-        );
+        )
+    }
+
+    /// Call this before constructing a timeline, to build its required structures
+    fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
         TimelineResources {
-            remote_client,
+            remote_client: self.build_timeline_remote_client(timeline_id),
             timeline_get_throttle: self.timeline_get_throttle.clone(),
             l0_flush_global_state: self.l0_flush_global_state.clone(),
         }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 14b894d17c..066fd12a9a 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -180,6 +180,7 @@
 
 pub(crate) mod download;
 pub mod index;
+pub mod manifest;
 pub(crate) mod upload;
 
 use anyhow::Context;
@@ -191,7 +192,6 @@ use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
-pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
     self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -245,9 +245,11 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
 use super::Generation;
 
 pub(crate) use download::{
-    download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
+    do_download_tenant_manifest, download_index_part, is_temp_download_file,
+    list_remote_tenant_shards, list_remote_timelines,
 };
 pub(crate) use index::LayerFileMetadata;
+pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};
 
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -272,6 +274,12 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 /// which we warn and skip.
 const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
 
+/// Hardcode a generation for the tenant manifest for now so that we don't
+/// need to deal with generation-less manifests in the future.
+///
+/// TODO: add proper generation support to all the places that use this.
+pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1);
+
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
     Deleted(IndexPart),
@@ -295,6 +303,10 @@ pub enum WaitCompletionError {
     UploadQueueShutDownOrStopped,
 }
 
+#[derive(Debug, thiserror::Error)]
+#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
+pub struct UploadQueueNotReadyError;
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -468,6 +480,20 @@ impl RemoteTimelineClient {
             .ok()
     }
 
+    /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived.
+    ///
+    /// Returns `Err(_)` if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet.
+    pub(crate) fn archived_at_stopped_queue(
+        &self,
+    ) -> Result<Option<NaiveDateTime>, UploadQueueNotReadyError> {
+        self.upload_queue
+            .lock()
+            .unwrap()
+            .stopped_mut()
+            .map(|q| q.upload_queue_for_deletion.clean.0.archived_at)
+            .map_err(|_| UploadQueueNotReadyError)
+    }
+
     fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
         let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
             current_remote_index_part
@@ -2198,6 +2224,17 @@ pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
     RemotePath::from_string(&path).expect("Failed to construct path")
 }
 
+pub fn remote_tenant_manifest_path(
+    tenant_shard_id: &TenantShardId,
+    generation: Generation,
+) -> RemotePath {
+    let path = format!(
+        "tenants/{tenant_shard_id}/tenant-manifest{}.json",
+        generation.get_suffix()
+    );
+    RemotePath::from_string(&path).expect("Failed to construct path")
+}
+
 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
     let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
     RemotePath::from_string(&path).expect("Failed to construct path")
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index b5d4b0f0bb..95f8f026d4 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -34,10 +34,11 @@ use utils::id::{TenantId, TimelineId};
 use utils::pausable_failpoint;
 
 use super::index::{IndexPart, LayerFileMetadata};
+use super::manifest::TenantManifest;
 use super::{
     parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
-    FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
+    remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path,
+    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };
 
 ///
@@ -338,19 +339,15 @@ pub async fn list_remote_timelines(
     list_identifiers::<TimelineId>(storage, remote_path, cancel).await
 }
 
-async fn do_download_index_part(
+async fn do_download_remote_path_retry_forever(
     storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-    index_generation: Generation,
+    remote_path: &RemotePath,
     cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
-    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
-
-    let (index_part_bytes, index_part_mtime) = download_retry_forever(
+) -> Result<(Vec<u8>, SystemTime), DownloadError> {
+    download_retry_forever(
         || async {
             let download = storage
-                .download(&remote_path, &DownloadOpts::default(), cancel)
+                .download(remote_path, &DownloadOpts::default(), cancel)
                 .await?;
 
             let mut bytes = Vec::new();
@@ -365,7 +362,39 @@ async fn do_download_index_part(
         &format!("download {remote_path:?}"),
         cancel,
     )
-    .await?;
+    .await
+}
+
+pub async fn do_download_tenant_manifest(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    cancel: &CancellationToken,
+) -> Result<(TenantManifest, Generation), DownloadError> {
+    // TODO: generation support
+    let generation = super::TENANT_MANIFEST_GENERATION;
+    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
+
+    let (manifest_bytes, _manifest_bytes_mtime) =
+        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
+
+    let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
+        .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
+        .map_err(DownloadError::Other)?;
+
+    Ok((tenant_manifest, generation))
+}
+
+async fn do_download_index_part(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    timeline_id: &TimelineId,
+    index_generation: Generation,
+    cancel: &CancellationToken,
+) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
+    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
+
+    let (index_part_bytes, index_part_mtime) =
+        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
 
     let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
         .with_context(|| format!("deserialize index part file at {remote_path:?}"))
diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs
new file mode 100644
index 0000000000..7d92d45146
--- /dev/null
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -0,0 +1,53 @@
+use chrono::NaiveDateTime;
+use serde::{Deserialize, Serialize};
+use utils::{id::TimelineId, lsn::Lsn};
+
+/// Tenant-shard scoped manifest
+#[derive(Clone, Serialize, Deserialize)]
+pub struct TenantManifest {
+    /// Debugging aid describing the version of this manifest.
+    /// Can also be used for distinguishing breaking changes later on.
+    pub version: usize,
+
+    /// The list of offloaded timelines together with enough information
+    /// to not have to actually load them.
+    ///
+    /// Note: the timelines mentioned in this list might be deleted, i.e.
+    /// we don't hold an invariant that the references aren't dangling.
+    /// Existence of index-part.json is the actual indicator of timeline existence.
+    pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
+}
+
+/// The remote level representation of an offloaded timeline.
+///
+/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
+/// but the two data structures serve different needs: this one is a persistent disk format
+/// that must be backwards compatible, while the other is only for informative purposes.
+#[derive(Clone, Serialize, Deserialize, Copy)]
+pub struct OffloadedTimelineManifest {
+    pub timeline_id: TimelineId,
+    /// The parent timeline this timeline has been branched off from, or `None` if it has no parent
+    pub ancestor_timeline_id: Option<TimelineId>,
+    /// The branch LSN to retain at the ancestor, or `None` if it is not to be retained
+    pub ancestor_retain_lsn: Option<Lsn>,
+    /// The time point when the timeline was archived
+    pub archived_at: NaiveDateTime,
+}
+
+pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
+
+impl TenantManifest {
+    pub(crate) fn empty() -> Self {
+        Self {
+            version: LATEST_TENANT_MANIFEST_VERSION,
+            offloaded_timelines: vec![],
+        }
+    }
+    pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice::<Self>(bytes)
+    }
+
+    pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
+        serde_json::to_vec(self)
+    }
+}
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index c4dd184610..5a2b7bd08f 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -13,9 +13,11 @@ use tokio_util::sync::CancellationToken;
 use utils::{backoff, pausable_failpoint};
 
 use super::index::IndexPart;
+use super::manifest::TenantManifest;
 use super::Generation;
 use crate::tenant::remote_timeline_client::{
     remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
+    remote_tenant_manifest_path,
 };
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
@@ -55,6 +57,37 @@ pub(crate) async fn upload_index_part<'a>(
         .await
         .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
+/// Serializes and uploads the given tenant manifest data to the remote storage.
+pub(crate) async fn upload_tenant_manifest(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    generation: Generation,
+    tenant_manifest: &TenantManifest,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    tracing::trace!("uploading new tenant manifest");
+
+    fail_point!("before-upload-manifest", |_| {
+        bail!("failpoint before-upload-manifest")
+    });
+    pausable_failpoint!("before-upload-manifest-pausable");
+
+    let serialized = tenant_manifest.to_json_bytes()?;
+    let serialized = Bytes::from(serialized);
+
+    let tenant_manifest_site = serialized.len();
+
+    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
+    storage
+        .upload_storage_object(
+            futures::stream::once(futures::future::ready(Ok(serialized))),
+            tenant_manifest_site,
+            &remote_path,
+            cancel,
+        )
+        .await
+        .with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'"))
+}
 
 /// Attempts to upload given layer files.
 /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d67a139dfa..d5ceec663b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -371,7 +371,7 @@ pub struct Timeline {
 
     /// Prevent two tasks from deleting the timeline at the same time. If held, the
     /// timeline is being deleted. If 'true', the timeline has already been deleted.
-    pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
+    pub delete_progress: TimelineDeleteProgress,
 
     eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
 
@@ -426,6 +426,8 @@ pub struct Timeline {
     pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
 }
 
+pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
+
 pub struct WalReceiverInfo {
     pub wal_source_connconf: PgConnectionConfig,
     pub last_received_msg_lsn: Lsn,
@@ -2250,7 +2252,7 @@ impl Timeline {
                 eviction_task_timeline_state: tokio::sync::Mutex::new(
                     EvictionTaskTimelineState::default(),
                 ),
-                delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
+                delete_progress: TimelineDeleteProgress::default(),
 
                 cancel,
                 gate: Gate::default(),
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 71b9e4e288..4799aab436 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,7 +14,9 @@ use crate::{
     task_mgr::{self, TaskKind},
     tenant::{
         metadata::TimelineMetadata,
-        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
+        remote_timeline_client::{
+            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
+        },
         CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
     },
 };
@@ -25,12 +27,9 @@ use super::{Timeline, TimelineResources};
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
 async fn set_deleted_in_remote_index(
-    timeline: &TimelineOrOffloaded,
+    remote_client: &Arc<RemoteTimelineClient>,
 ) -> Result<(), DeleteTimelineError> {
-    let res = timeline
-        .remote_client()
-        .persist_index_part_with_deleted_flag()
-        .await;
+    let res = remote_client.persist_index_part_with_deleted_flag().await;
     match res {
         // If we (now, or already) marked it successfully as deleted, we can proceed
         Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
@@ -129,12 +128,10 @@ pub(super) async fn delete_local_timeline_directory(
 }
 
 /// Removes remote layers and an index file after them.
-async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> {
-    timeline
-        .remote_client()
-        .delete_all()
-        .await
-        .context("delete_all")
+async fn delete_remote_layers_and_index(
+    remote_client: &Arc<RemoteTimelineClient>,
+) -> anyhow::Result<()> {
+    remote_client.delete_all().await.context("delete_all")
 }
 
 /// It is important that this gets called when DeletionGuard is being held.
@@ -179,6 +176,32 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
     Ok(())
 }
 
+/// It is important that this gets called when DeletionGuard is being held.
+/// For more context see comments in [`DeleteTimelineFlow::prepare`]
+async fn upload_new_tenant_manifest(
+    tenant: &Tenant,
+    _: &DeletionGuard, // using it as a witness
+) -> anyhow::Result<()> {
+    // This is susceptible to race conditions: if there is a crash between the deletion of the
+    // index-part.json and reaching this code, we won't continue the deletion afterwards.
+    // So the tenant manifest might indeed refer to an offloaded timeline which has already been deleted.
+    // However, we handle this case in the tenant loading code, so the issue is resolved the next
+    // time we attach.
+    let manifest = tenant.tenant_manifest();
+    // TODO: generation support
+    let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
+    remote_timeline_client::upload_tenant_manifest(
+        &tenant.remote_storage,
+        &tenant.tenant_shard_id,
+        generation,
+        &manifest,
+        &tenant.cancel,
+    )
+    .await?;
+
+    Ok(())
+}
+
 /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -235,7 +258,8 @@ impl DeleteTimelineFlow {
             ))?
         });
 
-        set_deleted_in_remote_index(&timeline).await?;
+        let remote_client = timeline.remote_client_maybe_construct(tenant);
+        set_deleted_in_remote_index(&remote_client).await?;
 
         fail::fail_point!("timeline-delete-before-schedule", |_| {
             Err(anyhow::anyhow!(
@@ -243,7 +267,13 @@ impl DeleteTimelineFlow {
             ))?
         });
 
-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        Self::schedule_background(
+            guard,
+            tenant.conf,
+            Arc::clone(tenant),
+            timeline,
+            remote_client,
+        );
 
         Ok(())
     }
@@ -301,8 +331,9 @@ impl DeleteTimelineFlow {
 
         guard.mark_in_progress()?;
 
+        let remote_client = timeline.remote_client.clone();
         let timeline = TimelineOrOffloaded::Timeline(timeline);
-        Self::schedule_background(guard, tenant.conf, tenant, timeline);
+        Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);
 
         Ok(())
     }
@@ -380,6 +411,7 @@ impl DeleteTimelineFlow {
         conf: &'static PageServerConf,
         tenant: Arc<Tenant>,
         timeline: TimelineOrOffloaded,
+        remote_client: Arc<RemoteTimelineClient>,
     ) {
         let tenant_shard_id = timeline.tenant_shard_id();
         let timeline_id = timeline.timeline_id();
@@ -391,7 +423,7 @@ impl DeleteTimelineFlow {
             Some(timeline_id),
             "timeline_delete",
             async move {
-                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
+                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
                     error!("Error: {err:#}");
                     if let TimelineOrOffloaded::Timeline(timeline) = timeline {
                         timeline.set_broken(format!("{err:#}"))
@@ -408,6 +440,7 @@ impl DeleteTimelineFlow {
         conf: &PageServerConf,
         tenant: &Tenant,
         timeline: &TimelineOrOffloaded,
+        remote_client: Arc<RemoteTimelineClient>,
     ) -> Result<(), DeleteTimelineError> {
         // Offloaded timelines have no local state
         // TODO: once we persist offloaded information, delete the timeline from there, too
@@ -415,12 +448,14 @@ impl DeleteTimelineFlow {
             delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
         }
 
-        delete_remote_layers_and_index(timeline).await?;
+        delete_remote_layers_and_index(&remote_client).await?;
 
         pausable_failpoint!("in_progress_delete");
 
         remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
 
+        upload_new_tenant_manifest(tenant, &guard).await?;
+
         *guard = Self::Finished;
 
         Ok(())
diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs
index 7e6084baaf..8e6eceb084 100644
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -1,17 +1,17 @@
 use std::sync::Arc;
 
-use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};
-
-use super::{
-    delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard},
-    Timeline,
-};
+use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
+use super::Timeline;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded};
 
 pub(crate) async fn offload_timeline(
     tenant: &Tenant,
     timeline: &Arc<Timeline>,
 ) -> anyhow::Result<()> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
     tracing::info!("offloading archived timeline");
+
     let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?;
 
     let TimelineOrOffloaded::Timeline(timeline) = timeline else {
@@ -19,14 +19,28 @@ pub(crate) async fn offload_timeline(
         return Ok(());
     };
 
+    let is_archived = timeline.is_archived();
+    match is_archived {
+        Some(true) => (),
+        Some(false) => {
+            tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
+            anyhow::bail!("timeline isn't archived");
+        }
+        None => {
+            tracing::warn!(
+                ?is_archived,
+                "tried offloading a timeline where manifest is not yet available"
+            );
+            anyhow::bail!("timeline manifest hasn't been loaded yet");
+        }
+    }
+
     // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
     timeline.shutdown(super::ShutdownMode::Hard).await;
 
     // TODO extend guard mechanism above with method
     // to make deletions possible while offloading is in progress
 
-    // TODO mark timeline as offloaded in S3
-
     let conf = &tenant.conf;
     delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?;
 
@@ -36,10 +50,31 @@ pub(crate) async fn offload_timeline(
         let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
         offloaded_timelines.insert(
             timeline.timeline_id,
-            Arc::new(OffloadedTimeline::from_timeline(&timeline)),
+            Arc::new(
+                OffloadedTimeline::from_timeline(&timeline)
+                    .expect("we checked above that timeline was ready"),
+            ),
         );
     }
 
+    // Last step: mark timeline as offloaded in S3
+    // TODO: maybe move this step above, right above deletion of the local timeline directory,
+    // then there is no potential race condition where we partially offload a timeline, and
+    // at the next restart attach it again.
+    // For that to happen, we'd need to make the manifest reflect our *intended* state,
+    // not our actual state of offloaded timelines.
+    let manifest = tenant.tenant_manifest();
+    // TODO: generation support
+    let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
+    remote_timeline_client::upload_tenant_manifest(
+        &tenant.remote_storage,
+        &tenant.tenant_shard_id,
+        generation,
+        &manifest,
+        &tenant.cancel,
+    )
+    .await?;
+
     Ok(())
 }
 
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 85e1077fd5..cb8724dd1c 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -4,8 +4,11 @@ import pytest
 from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
+from fixtures.remote_storage import s3_storage
 from fixtures.utils import wait_until
 
 
@@ -168,7 +171,7 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
         state=TimelineArchivalState.ARCHIVED,
     )
 
-    def timeline_offloaded(timeline_id: TimelineId) -> bool:
+    def timeline_offloaded_logged(timeline_id: TimelineId) -> bool:
         return (
             env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*")
             is not None
@@ -186,12 +189,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
     def parent_offloaded():
         if manual_offload:
             ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id)
-        assert timeline_offloaded(parent_timeline_id)
+        assert timeline_offloaded_logged(parent_timeline_id)
 
     def leaf_offloaded():
         if manual_offload:
             ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id)
-        assert timeline_offloaded(leaf_timeline_id)
+        assert timeline_offloaded_logged(leaf_timeline_id)
 
     wait_until(30, 1, leaf_offloaded)
     wait_until(30, 1, parent_offloaded)
@@ -218,4 +221,118 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
         sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50")
         assert sum == sum_again
 
-    assert not timeline_offloaded(initial_timeline_id)
+    assert not timeline_offloaded_logged(initial_timeline_id)
+
+
+def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder):
+    """
+    Test for persistence of timeline offload state
+    """
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    # Turn off gc and compaction loops: we want to issue them manually for better reliability
+    tenant_id, root_timeline_id = env.create_tenant(
+        conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{1024 ** 2}",
+        }
+    )
+
+    # Create a branch and archive it
+    child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id)
+
+    with env.endpoints.create_start(
+        "test_archived_branch_persisted", tenant_id=tenant_id
+    ) as endpoint:
+        endpoint.safe_psql_many(
+            [
+                "CREATE TABLE foo(key serial primary key, t text default 'data_content')",
+                "INSERT INTO foo SELECT FROM generate_series(1,2048)",
+            ]
+        )
+        sum = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
+        last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id)
+
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=f"tenants/{str(tenant_id)}/",
+    )
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
+    )
+
+    ps_http.timeline_archival_config(
+        tenant_id,
+        child_timeline_id,
+        state=TimelineArchivalState.ARCHIVED,
+    )
+    leaf_detail = ps_http.timeline_detail(
+        tenant_id,
+        child_timeline_id,
+    )
+    assert leaf_detail["is_archived"] is True
+
+    def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
+        # TODO add a proper API to check if a timeline has been offloaded or not
+        return not any(
+            timeline["timeline_id"] == str(timeline_id)
+            for timeline in ps_http.timeline_list(tenant_id=tenant_id)
+        )
+
+    def child_offloaded():
+        ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
+        assert timeline_offloaded_api(child_timeline_id)
+
+    wait_until(30, 1, child_offloaded)
+
+    assert timeline_offloaded_api(child_timeline_id)
+    assert not timeline_offloaded_api(root_timeline_id)
+
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
+    )
+
+    # Test persistence, is the timeline still offloaded?
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    assert timeline_offloaded_api(child_timeline_id)
+    assert not timeline_offloaded_api(root_timeline_id)
+
+    ps_http.timeline_archival_config(
+        tenant_id,
+        child_timeline_id,
+        state=TimelineArchivalState.UNARCHIVED,
+    )
+    child_detail = ps_http.timeline_detail(
+        tenant_id,
+        child_timeline_id,
+    )
+    assert child_detail["is_archived"] is False
+
+    with env.endpoints.create_start(
+        "test_archived_branch_persisted", tenant_id=tenant_id
+    ) as endpoint:
+        sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500")
+        assert sum == sum_again
+
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest",
+    )
+
+    assert not timeline_offloaded_api(root_timeline_id)
+
+    ps_http.tenant_delete(tenant_id)
+
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=f"tenants/{str(tenant_id)}/",
+    )

From 64949a37a91124957f30177d10a57a061d8fea02 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 22 Oct 2024 18:06:21 -0400
Subject: [PATCH 42/48] fix(pageserver): make delta split layer writer finish
 atomic (#9048)

similar to https://github.com/neondatabase/neon/pull/8841, we make the
delta layer writer atomic when finishing the layers.

## Summary of changes

* `put_value` no longer takes a discard fn
* `finish` decides which layers to keep

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 .../src/tenant/storage_layer/split_writer.rs  | 162 +++++++-----------
 pageserver/src/tenant/timeline/compaction.rs  |  34 +---
 2 files changed, 63 insertions(+), 133 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index 5bd9a47e2b..45ac0c6668 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -170,7 +170,7 @@ impl SplitImageLayerWriter {
                     }
                     Err(e) => {
                         // ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong,
-                        // so we don't need to remove it by ourselves.
+                        // so we don't need to remove the layer we just failed to create by ourselves.
                         clean_up_layers(generated_layers);
                         return Err(e);
                     }
@@ -206,7 +206,7 @@ impl SplitImageLayerWriter {
 pub struct SplitDeltaLayerWriter {
     inner: Option<(Key, DeltaLayerWriter)>,
     target_layer_size: u64,
-    generated_layers: Vec<SplitWriterResult>,
+    generated_layer_writers: Vec<(DeltaLayerWriter, PersistentLayerKey)>,
     conf: &'static PageServerConf,
     timeline_id: TimelineId,
     tenant_shard_id: TenantShardId,
@@ -225,7 +225,7 @@ impl SplitDeltaLayerWriter {
         Ok(Self {
             target_layer_size,
             inner: None,
-            generated_layers: Vec::new(),
+            generated_layer_writers: Vec::new(),
             conf,
             timeline_id,
             tenant_shard_id,
@@ -234,20 +234,13 @@ impl SplitDeltaLayerWriter {
         })
     }
 
-    /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
-    pub async fn put_value_with_discard_fn<D, F>(
+    pub async fn put_value(
         &mut self,
         key: Key,
         lsn: Lsn,
         val: Value,
-        tline: &Arc<Timeline>,
         ctx: &RequestContext,
-        discard: D,
-    ) -> anyhow::Result<()>
-    where
-        D: FnOnce(&PersistentLayerKey) -> F,
-        F: Future<Output = bool>,
-    {
+    ) -> anyhow::Result<()> {
         // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
         // number, and therefore the final layer size could be a little bit larger or smaller than the target.
         //
@@ -291,24 +284,8 @@ impl SplitDeltaLayerWriter {
                     lsn_range: self.lsn_range.clone(),
                     is_delta: true,
                 };
-                if discard(&layer_key).await {
-                    drop(prev_delta_writer);
-                    self.generated_layers
-                        .push(SplitWriterResult::Discarded(layer_key));
-                } else {
-                    // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
-                    // files for `finish_creating`.
-                    let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
-                    let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
-                        Ok(layer) => layer,
-                        Err(e) => {
-                            tokio::fs::remove_file(&path).await.ok();
-                            return Err(e);
-                        }
-                    };
-                    self.generated_layers
-                        .push(SplitWriterResult::Produced(delta_layer));
-                }
+                self.generated_layer_writers
+                    .push((prev_delta_writer, layer_key));
             } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
                 // We have to produce a very large file b/c a key is updated too often.
                 anyhow::bail!(
@@ -323,60 +300,68 @@ impl SplitDeltaLayerWriter {
         inner.put_value(key, lsn, val, ctx).await
     }
 
-    pub async fn put_value(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        val: Value,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
-            .await
-    }
-
     pub(crate) async fn finish_with_discard_fn<D, F>(
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
-        discard: D,
+        discard_fn: D,
     ) -> anyhow::Result<Vec<SplitWriterResult>>
     where
-        D: FnOnce(&PersistentLayerKey) -> F,
+        D: Fn(&PersistentLayerKey) -> F,
         F: Future<Output = bool>,
     {
         let Self {
-            mut generated_layers,
+            mut generated_layer_writers,
             inner,
             ..
         } = self;
-        let Some((start_key, inner)) = inner else {
-            return Ok(generated_layers);
-        };
-        if inner.num_keys() == 0 {
-            return Ok(generated_layers);
+        if let Some((start_key, writer)) = inner {
+            if writer.num_keys() != 0 {
+                let end_key = self.last_key_written.next();
+                let layer_key = PersistentLayerKey {
+                    key_range: start_key..end_key,
+                    lsn_range: self.lsn_range.clone(),
+                    is_delta: true,
+                };
+                generated_layer_writers.push((writer, layer_key));
+            }
         }
-        let end_key = self.last_key_written.next();
-        let layer_key = PersistentLayerKey {
-            key_range: start_key..end_key,
-            lsn_range: self.lsn_range.clone(),
-            is_delta: true,
-        };
-        if discard(&layer_key).await {
-            generated_layers.push(SplitWriterResult::Discarded(layer_key));
-        } else {
-            // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
-            // files for `finish_creating`.
-            let (desc, path) = inner.finish(end_key, ctx).await?;
-            let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
-                Ok(layer) => layer,
-                Err(e) => {
-                    tokio::fs::remove_file(&path).await.ok();
-                    return Err(e);
+        let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
+            for produced_layer in generated_layers {
+                if let SplitWriterResult::Produced(delta_layer) = produced_layer {
+                    let layer: Layer = delta_layer.into();
+                    layer.delete_on_drop();
                 }
-            };
-            generated_layers.push(SplitWriterResult::Produced(delta_layer));
+            }
+        };
+        // BEGIN: catch every error and do the recovery in the below section
+        let mut generated_layers = Vec::new();
+        for (inner, layer_key) in generated_layer_writers {
+            if discard_fn(&layer_key).await {
+                generated_layers.push(SplitWriterResult::Discarded(layer_key));
+            } else {
+                let layer = match inner.finish(layer_key.key_range.end, ctx).await {
+                    Ok((desc, path)) => {
+                        match Layer::finish_creating(self.conf, tline, desc, &path) {
+                            Ok(layer) => layer,
+                            Err(e) => {
+                                tokio::fs::remove_file(&path).await.ok();
+                                clean_up_layers(generated_layers);
+                                return Err(e);
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        // DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
+                        // so we don't need to remove the layer we just failed to create by ourselves.
+                        clean_up_layers(generated_layers);
+                        return Err(e);
+                    }
+                };
+                generated_layers.push(SplitWriterResult::Produced(layer));
+            }
         }
+        // END: catch every error and do the recovery in the above section
         Ok(generated_layers)
     }
 
@@ -389,11 +374,6 @@ impl SplitDeltaLayerWriter {
         self.finish_with_discard_fn(tline, ctx, |_| async { false })
             .await
     }
-
-    /// This function will be deprecated with #8841.
-    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
-        Ok((self.generated_layers, self.inner.map(|x| x.1)))
-    }
 }
 
 #[cfg(test)]
@@ -473,13 +453,7 @@ mod tests {
         assert_eq!(layers.len(), 1);
 
         delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
+            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
             .await
             .unwrap();
         let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -551,14 +525,7 @@ mod tests {
                 .await
                 .unwrap();
             delta_writer
-                .put_value_with_discard_fn(
-                    get_key(i),
-                    Lsn(0x20),
-                    Value::Image(get_large_img()),
-                    &tline,
-                    &ctx,
-                    |_| async { discard },
-                )
+                .put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx)
                 .await
                 .unwrap();
         }
@@ -664,23 +631,11 @@ mod tests {
         assert_eq!(layers.len(), 2);
 
         delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
+            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
             .await
             .unwrap();
         delta_writer
-            .put_value(
-                get_key(1),
-                Lsn(0x1A),
-                Value::Image(get_large_img()),
-                &tline,
-                &ctx,
-            )
+            .put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx)
             .await
             .unwrap();
         let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -744,7 +699,6 @@ mod tests {
                     get_key(0),
                     Lsn(i as u64 * 16 + 0x10),
                     Value::Image(get_large_img()),
-                    &tline,
                     &ctx,
                 )
                 .await
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 5cb1460b29..37d907ddcb 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -121,18 +121,12 @@ impl KeyHistoryRetention {
     async fn pipe_to(
         self,
         key: Key,
-        tline: &Arc<Timeline>,
         delta_writer: &mut SplitDeltaLayerWriter,
         mut image_writer: Option<&mut SplitImageLayerWriter>,
         stat: &mut CompactionStatistics,
-        dry_run: bool,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let mut first_batch = true;
-        let discard = |key: &PersistentLayerKey| {
-            let key = key.clone();
-            async move { Self::discard_key(&key, tline, dry_run).await }
-        };
         for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
             if first_batch {
                 if logs.len() == 1 && logs[0].1.is_image() {
@@ -144,40 +138,27 @@ impl KeyHistoryRetention {
                         image_writer.put_image(key, img.clone(), ctx).await?;
                     } else {
                         delta_writer
-                            .put_value_with_discard_fn(
-                                key,
-                                cutoff_lsn,
-                                Value::Image(img.clone()),
-                                tline,
-                                ctx,
-                                discard,
-                            )
+                            .put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx)
                             .await?;
                     }
                 } else {
                     for (lsn, val) in logs {
                         stat.produce_key(&val);
-                        delta_writer
-                            .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
-                            .await?;
+                        delta_writer.put_value(key, lsn, val, ctx).await?;
                     }
                 }
                 first_batch = false;
             } else {
                 for (lsn, val) in logs {
                     stat.produce_key(&val);
-                    delta_writer
-                        .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
-                        .await?;
+                    delta_writer.put_value(key, lsn, val, ctx).await?;
                 }
             }
         }
         let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
         for (lsn, val) in above_horizon_logs {
             stat.produce_key(&val);
-            delta_writer
-                .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
-                .await?;
+            delta_writer.put_value(key, lsn, val, ctx).await?;
         }
         Ok(())
     }
@@ -1988,11 +1969,9 @@ impl Timeline {
                 retention
                     .pipe_to(
                         *last_key,
-                        self,
                         &mut delta_layer_writer,
                         image_layer_writer.as_mut(),
                         &mut stat,
-                        dry_run,
                         ctx,
                     )
                     .await?;
@@ -2019,11 +1998,9 @@ impl Timeline {
         retention
             .pipe_to(
                 last_key,
-                self,
                 &mut delta_layer_writer,
                 image_layer_writer.as_mut(),
                 &mut stat,
-                dry_run,
                 ctx,
             )
             .await?;
@@ -2051,8 +2028,7 @@ impl Timeline {
                 .finish_with_discard_fn(self, ctx, discard)
                 .await?
         } else {
-            let (layers, _) = delta_layer_writer.take()?;
-            assert!(layers.is_empty(), "delta layers produced in dry run mode?");
+            drop(delta_layer_writer);
             Vec::new()
         };
 

From 3a3bd34a28e0137513e7e31a6b808cf9566a14c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 23 Oct 2024 00:34:24 +0200
Subject: [PATCH 43/48] Rename IndexPart::{from_s3_bytes,to_s3_bytes} (#9481)

We support multiple storage backends now, so replace the `_s3_` in the
method names with `_json_` (`from_json_bytes`, `to_json_bytes`).

Analogous to the names adopted for tenant manifests added in #9444.
---
 pageserver/ctl/src/index_part.rs              |  2 +-
 .../tenant/remote_timeline_client/index.rs    | 24 +++++++++----------
 .../tenant/remote_timeline_client/upload.rs   |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 20018846f8..6cce2844c7 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -11,7 +11,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
     match cmd {
         IndexPartCmd::Dump { path } => {
             let bytes = tokio::fs::read(path).await.context("read file")?;
-            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
+            let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
             let output = serde_json::to_string_pretty(&des).context("serialize output")?;
             println!("{output}");
             Ok(())
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 3a74a4ed11..d8a881a2c4 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -121,11 +121,11 @@ impl IndexPart {
         self.disk_consistent_lsn
     }
 
-    pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
         serde_json::from_slice::<IndexPart>(bytes)
     }
 
-    pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
+    pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
         serde_json::to_vec(self)
     }
 
@@ -383,7 +383,7 @@ mod tests {
             last_aux_file_policy: None,
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -427,7 +427,7 @@ mod tests {
             last_aux_file_policy: None,
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -472,7 +472,7 @@ mod tests {
             last_aux_file_policy: None,
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -520,7 +520,7 @@ mod tests {
             last_aux_file_policy: None,
         };
 
-        let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
+        let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
 
         assert_eq!(empty_layers_parsed, expected);
     }
@@ -563,7 +563,7 @@ mod tests {
             last_aux_file_policy: None,
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -609,7 +609,7 @@ mod tests {
             last_aux_file_policy: None,
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -660,7 +660,7 @@ mod tests {
             last_aux_file_policy: Some(AuxFilePolicy::V2),
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -716,7 +716,7 @@ mod tests {
             last_aux_file_policy: Default::default(),
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -773,7 +773,7 @@ mod tests {
             last_aux_file_policy: Default::default(),
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
@@ -835,7 +835,7 @@ mod tests {
             archived_at: None,
         };
 
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 5a2b7bd08f..0cd5d05aa2 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -41,7 +41,7 @@ pub(crate) async fn upload_index_part<'a>(
     pausable_failpoint!("before-upload-index-pausable");
 
     // FIXME: this error comes too late
-    let serialized = index_part.to_s3_bytes()?;
+    let serialized = index_part.to_json_bytes()?;
     let serialized = Bytes::from(serialized);
 
     let index_part_size = serialized.len();

From 92d5e0e87a8d397f86cb7c8dc0fddb318b2da46b Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 23 Oct 2024 08:21:28 +0200
Subject: [PATCH 44/48] proxy: clear lib.rs of code items (#9479)

We keep lib.rs for crate configs, lint configs and re-exports for the binaries.
The code items that used to live there move into the `error`, `signals` and
`types` modules.
---
 proxy/src/auth/backend/jwt.rs            |   4 +-
 proxy/src/auth/backend/local.rs          |   3 +-
 proxy/src/auth/backend/mod.rs            |   5 +-
 proxy/src/auth/credentials.rs            |   2 +-
 proxy/src/auth/password_hack.rs          |   2 +-
 proxy/src/bin/local_proxy.rs             |   6 +-
 proxy/src/bin/pg_sni_router.rs           |   6 +-
 proxy/src/bin/proxy.rs                   |   6 +-
 proxy/src/cache/endpoints.rs             |   2 +-
 proxy/src/cache/project_info.rs          |   4 +-
 proxy/src/compute.rs                     |   2 +-
 proxy/src/compute_ctl/mod.rs             |   3 +-
 proxy/src/config.rs                      |   2 +-
 proxy/src/context/mod.rs                 |   2 +-
 proxy/src/control_plane/provider/mock.rs |   3 +-
 proxy/src/control_plane/provider/mod.rs  |   3 +-
 proxy/src/control_plane/provider/neon.rs |   3 +-
 proxy/src/error.rs                       |   7 +
 proxy/src/intern.rs                      |   2 +-
 proxy/src/lib.rs                         | 168 +----------------------
 proxy/src/proxy/connect_compute.rs       |   2 +-
 proxy/src/proxy/mod.rs                   |   3 +-
 proxy/src/proxy/tests/mod.rs             |   3 +-
 proxy/src/rate_limiter/limiter.rs        |   2 +-
 proxy/src/redis/notifications.rs         |   2 +-
 proxy/src/scram/mod.rs                   |   2 +-
 proxy/src/scram/threadpool.rs            |   2 +-
 proxy/src/serverless/backend.rs          |   3 +-
 proxy/src/serverless/conn_pool.rs        |   2 +-
 proxy/src/serverless/conn_pool_lib.rs    |   2 +-
 proxy/src/serverless/http_conn_pool.rs   |   2 +-
 proxy/src/serverless/local_conn_pool.rs  |   2 +-
 proxy/src/serverless/sql_over_http.rs    |   2 +-
 proxy/src/signals.rs                     |  39 ++++++
 proxy/src/types.rs                       | 122 ++++++++++++++++
 proxy/src/usage_metrics.rs               |   3 +-
 36 files changed, 221 insertions(+), 207 deletions(-)
 create mode 100644 proxy/src/signals.rs
 create mode 100644 proxy/src/types.rs

diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 3f53ee24c3..2185677159 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -16,7 +16,7 @@ use crate::context::RequestMonitoring;
 use crate::control_plane::errors::GetEndpointJwksError;
 use crate::http::parse_json_body_with_limit;
 use crate::intern::RoleNameInt;
-use crate::{EndpointId, RoleName};
+use crate::types::{EndpointId, RoleName};
 
 // TODO(conrad): make these configurable.
 const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
@@ -669,7 +669,7 @@ mod tests {
     use tokio::net::TcpListener;
 
     use super::*;
-    use crate::RoleName;
+    use crate::types::RoleName;
 
     fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
         let sk = p256::SecretKey::random(&mut OsRng);
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 1e029ff609..f9cb085daf 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -10,9 +10,10 @@ use crate::compute_ctl::ComputeCtlApi;
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo};
 use crate::control_plane::NodeInfo;
+use crate::http;
 use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag};
+use crate::types::EndpointId;
 use crate::url::ApiUrl;
-use crate::{http, EndpointId};
 
 pub struct LocalBackend {
     pub(crate) initialize: Semaphore,
diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index a4db130b61..17334b9cbb 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -32,7 +32,8 @@ use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
 use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
 use crate::stream::Stream;
-use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName};
+use crate::types::{EndpointCacheKey, EndpointId, RoleName};
+use crate::{scram, stream};
 
 /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
 pub enum MaybeOwned<'a, T> {
@@ -551,7 +552,7 @@ mod tests {
         async fn get_endpoint_jwks(
             &self,
             _ctx: &RequestMonitoring,
-            _endpoint: crate::EndpointId,
+            _endpoint: crate::types::EndpointId,
         ) -> Result<Vec<super::jwt::AuthRule>, control_plane::errors::GetEndpointJwksError>
         {
             unimplemented!()
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 465e427f7c..ddecae6af5 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -15,7 +15,7 @@ use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, SniKind};
 use crate::proxy::NeonOptions;
 use crate::serverless::SERVERLESS_DRIVER_SNI;
-use crate::{EndpointId, RoleName};
+use crate::types::{EndpointId, RoleName};
 
 #[derive(Debug, Error, PartialEq, Eq, Clone)]
 pub(crate) enum ComputeUserInfoParseError {
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 8585b8ff48..b934c28a78 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -5,7 +5,7 @@
 
 use bstr::ByteSlice;
 
-use crate::EndpointId;
+use crate::types::EndpointId;
 
 pub(crate) struct PasswordHackPayload {
     pub(crate) endpoint: EndpointId,
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index a16c288e5d..df3628465f 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -25,8 +25,8 @@ use proxy::rate_limiter::{
 use proxy::scram::threadpool::ThreadPool;
 use proxy::serverless::cancel_set::CancelSet;
 use proxy::serverless::{self, GlobalConnPoolOptions};
+use proxy::types::RoleName;
 use proxy::url::ApiUrl;
-use proxy::RoleName;
 
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
@@ -177,7 +177,7 @@ async fn main() -> anyhow::Result<()> {
     let mut maintenance_tasks = JoinSet::new();
 
     let refresh_config_notify = Arc::new(Notify::new());
-    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), {
+    maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), {
         let refresh_config_notify = Arc::clone(&refresh_config_notify);
         move || {
             refresh_config_notify.notify_one();
@@ -216,7 +216,7 @@ async fn main() -> anyhow::Result<()> {
 
     match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
         // exit immediately on maintenance task completion
-        Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {},
+        Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {},
         // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
         Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
         // exit immediately on client task error
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 13b7fdd40a..025053d3cb 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -133,14 +133,14 @@ async fn main() -> anyhow::Result<()> {
         proxy_listener,
         cancellation_token.clone(),
     ));
-    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {}));
+    let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {}));
 
     // the signal task cant ever succeed.
     // the main task can error, or can succeed on cancellation.
     // we want to immediately exit on either of these cases
     let signal = match futures::future::select(signals_task, main).await {
-        Either::Left((res, _)) => proxy::flatten_err(res)?,
-        Either::Right((res, _)) => return proxy::flatten_err(res),
+        Either::Left((res, _)) => proxy::error::flatten_err(res)?,
+        Either::Right((res, _)) => return proxy::error::flatten_err(res),
     };
 
     // maintenance tasks return `Infallible` success values, this is an impossible value
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 96a71e69c6..6e190029aa 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -495,7 +495,7 @@ async fn main() -> anyhow::Result<()> {
 
     // maintenance tasks. these never return unless there's an error
     let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {}));
+    maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {}));
     maintenance_tasks.spawn(http::health_server::task_main(
         http_listener,
         AppMetrics {
@@ -561,11 +561,11 @@ async fn main() -> anyhow::Result<()> {
         .await
         {
             // exit immediately on maintenance task completion
-            Either::Left((Some(res), _)) => break proxy::flatten_err(res)?,
+            Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?,
             // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
             Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
             // exit immediately on client task error
-            Either::Right((Some(res), _)) => proxy::flatten_err(res)?,
+            Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?,
             // exit if all our client tasks have shutdown gracefully
             Either::Right((None, _)) => return Ok(()),
         }
diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
index 82f3247fa7..12c33169bf 100644
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -17,7 +17,7 @@ use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
 use crate::rate_limiter::GlobalRateLimiter;
 use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
-use crate::EndpointId;
+use crate::types::EndpointId;
 
 #[derive(Deserialize, Debug, Clone)]
 pub(crate) struct ControlPlaneEventKey {
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 31d1dc96e7..84430dc812 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -17,7 +17,7 @@ use crate::auth::IpPattern;
 use crate::config::ProjectInfoCacheOptions;
 use crate::control_plane::AuthSecret;
 use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt};
-use crate::{EndpointId, RoleName};
+use crate::types::{EndpointId, RoleName};
 
 #[async_trait]
 pub(crate) trait ProjectInfoCache {
@@ -368,7 +368,7 @@ impl Cache for ProjectInfoCacheImpl {
 mod tests {
     use super::*;
     use crate::scram::ServerSecret;
-    use crate::ProjectId;
+    use crate::types::ProjectId;
 
     #[tokio::test]
     async fn test_project_info_cache_settings() {
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index a7c2cab4a1..b97942ee5d 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -25,7 +25,7 @@ use crate::control_plane::provider::ApiLockError;
 use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, NumDbConnectionsGuard};
 use crate::proxy::neon_option;
-use crate::Host;
+use crate::types::Host;
 
 pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
 
diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs
index 2b57897223..60fdf107d4 100644
--- a/proxy/src/compute_ctl/mod.rs
+++ b/proxy/src/compute_ctl/mod.rs
@@ -4,8 +4,9 @@ use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 
+use crate::http;
+use crate::types::{DbName, RoleName};
 use crate::url::ApiUrl;
-use crate::{http, DbName, RoleName};
 
 pub struct ComputeCtlApi {
     pub(crate) api: http::Endpoint,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 3baa7ec751..5183f22fa3 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -20,7 +20,7 @@ use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}
 use crate::scram::threadpool::ThreadPool;
 use crate::serverless::cancel_set::CancelSet;
 use crate::serverless::GlobalConnPoolOptions;
-use crate::Host;
+use crate::types::Host;
 
 pub struct ProxyConfig {
     pub tls_config: Option<TlsConfig>,
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index e2d2c1b766..ca3b808a1b 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt};
 use crate::metrics::{
     ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting,
 };
-use crate::{DbName, EndpointId, RoleName};
+use crate::types::{DbName, EndpointId, RoleName};
 
 pub mod parquet;
 
diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs
index fb061376e7..75a242d8d3 100644
--- a/proxy/src/control_plane/provider/mock.rs
+++ b/proxy/src/control_plane/provider/mock.rs
@@ -21,8 +21,9 @@ use crate::control_plane::messages::MetricsAuxInfo;
 use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::error::io_error;
 use crate::intern::RoleNameInt;
+use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
 use crate::url::ApiUrl;
-use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName};
+use crate::{compute, scram};
 
 #[derive(Debug, Error)]
 enum MockApiError {
diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs
index 88399dffa8..49e57b6b7e 100644
--- a/proxy/src/control_plane/provider/mod.rs
+++ b/proxy/src/control_plane/provider/mod.rs
@@ -23,7 +23,8 @@ use crate::error::ReportableError;
 use crate::intern::ProjectIdInt;
 use crate::metrics::ApiLockMetrics;
 use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
-use crate::{compute, scram, EndpointCacheKey, EndpointId};
+use crate::types::{EndpointCacheKey, EndpointId};
+use crate::{compute, scram};
 
 pub(crate) mod errors {
     use thiserror::Error;
diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs
index 5d0692c7ca..8ea91d7875 100644
--- a/proxy/src/control_plane/provider/neon.rs
+++ b/proxy/src/control_plane/provider/neon.rs
@@ -24,7 +24,8 @@ use crate::control_plane::errors::GetEndpointJwksError;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
 use crate::metrics::{CacheOutcome, Metrics};
 use crate::rate_limiter::WakeComputeRateLimiter;
-use crate::{compute, http, scram, EndpointCacheKey, EndpointId};
+use crate::types::{EndpointCacheKey, EndpointId};
+use crate::{compute, http, scram};
 
 const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
 
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index e71ed0c048..7b693a7418 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -1,7 +1,9 @@
 use std::error::Error as StdError;
 use std::{fmt, io};
 
+use anyhow::Context;
 use measured::FixedCardinalityLabel;
+use tokio::task::JoinError;
 
 /// Upcast (almost) any error into an opaque [`io::Error`].
 pub(crate) fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
@@ -97,3 +99,8 @@ impl ReportableError for tokio_postgres::error::Error {
         }
     }
 }
+
+/// Flattens `Result<Result<T>>` into `Result<T>`.
+pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
+    r.context("join error").and_then(|x| x)
+}
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs
index 49aab917e4..f56d92a6b3 100644
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -7,7 +7,7 @@ use std::sync::OnceLock;
 use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
 use rustc_hash::FxHasher;
 
-use crate::{BranchId, EndpointId, ProjectId, RoleName};
+use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
 
 pub trait InternId: Sized + 'static {
     fn get_interner() -> &'static StringInterner<Self>;
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index ea17a88067..f95d645c23 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -78,14 +78,6 @@
 // List of temporarily allowed lints to unblock beta/nightly.
 #![allow(unknown_lints)]
 
-use std::convert::Infallible;
-
-use anyhow::{bail, Context};
-use intern::{EndpointIdInt, EndpointIdTag, InternId};
-use tokio::task::JoinError;
-use tokio_util::sync::CancellationToken;
-use tracing::warn;
-
 pub mod auth;
 pub mod cache;
 pub mod cancellation;
@@ -109,165 +101,9 @@ pub mod redis;
 pub mod sasl;
 pub mod scram;
 pub mod serverless;
+pub mod signals;
 pub mod stream;
+pub mod types;
 pub mod url;
 pub mod usage_metrics;
 pub mod waiters;
-
-/// Handle unix signals appropriately.
-pub async fn handle_signals<F>(
-    token: CancellationToken,
-    mut refresh_config: F,
-) -> anyhow::Result<Infallible>
-where
-    F: FnMut(),
-{
-    use tokio::signal::unix::{signal, SignalKind};
-
-    let mut hangup = signal(SignalKind::hangup())?;
-    let mut interrupt = signal(SignalKind::interrupt())?;
-    let mut terminate = signal(SignalKind::terminate())?;
-
-    loop {
-        tokio::select! {
-            // Hangup is commonly used for config reload.
-            _ = hangup.recv() => {
-                warn!("received SIGHUP");
-                refresh_config();
-            }
-            // Shut down the whole application.
-            _ = interrupt.recv() => {
-                warn!("received SIGINT, exiting immediately");
-                bail!("interrupted");
-            }
-            _ = terminate.recv() => {
-                warn!("received SIGTERM, shutting down once all existing connections have closed");
-                token.cancel();
-            }
-        }
-    }
-}
-
-/// Flattens `Result<Result<T>>` into `Result<T>`.
-pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
-    r.context("join error").and_then(|x| x)
-}
-
-macro_rules! smol_str_wrapper {
-    ($name:ident) => {
-        #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
-        pub struct $name(smol_str::SmolStr);
-
-        impl $name {
-            #[allow(unused)]
-            pub(crate) fn as_str(&self) -> &str {
-                self.0.as_str()
-            }
-        }
-
-        impl std::fmt::Display for $name {
-            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                self.0.fmt(f)
-            }
-        }
-
-        impl<T> std::cmp::PartialEq<T> for $name
-        where
-            smol_str::SmolStr: std::cmp::PartialEq<T>,
-        {
-            fn eq(&self, other: &T) -> bool {
-                self.0.eq(other)
-            }
-        }
-
-        impl<T> From<T> for $name
-        where
-            smol_str::SmolStr: From<T>,
-        {
-            fn from(x: T) -> Self {
-                Self(x.into())
-            }
-        }
-
-        impl AsRef<str> for $name {
-            fn as_ref(&self) -> &str {
-                self.0.as_ref()
-            }
-        }
-
-        impl std::ops::Deref for $name {
-            type Target = str;
-            fn deref(&self) -> &str {
-                &*self.0
-            }
-        }
-
-        impl<'de> serde::de::Deserialize<'de> for $name {
-            fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
-                <smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
-            }
-        }
-
-        impl serde::Serialize for $name {
-            fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
-                self.0.serialize(s)
-            }
-        }
-    };
-}
-
-const POOLER_SUFFIX: &str = "-pooler";
-
-impl EndpointId {
-    fn normalize(&self) -> Self {
-        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
-            stripped.into()
-        } else {
-            self.clone()
-        }
-    }
-
-    fn normalize_intern(&self) -> EndpointIdInt {
-        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
-            EndpointIdTag::get_interner().get_or_intern(stripped)
-        } else {
-            self.into()
-        }
-    }
-}
-
-// 90% of role name strings are 20 characters or less.
-smol_str_wrapper!(RoleName);
-// 50% of endpoint strings are 23 characters or less.
-smol_str_wrapper!(EndpointId);
-// 50% of branch strings are 23 characters or less.
-smol_str_wrapper!(BranchId);
-// 90% of project strings are 23 characters or less.
-smol_str_wrapper!(ProjectId);
-
-// will usually equal endpoint ID
-smol_str_wrapper!(EndpointCacheKey);
-
-smol_str_wrapper!(DbName);
-
-// postgres hostname, will likely be a port:ip addr
-smol_str_wrapper!(Host);
-
-// Endpoints are a bit tricky. Rare they might be branches or projects.
-impl EndpointId {
-    pub(crate) fn is_endpoint(&self) -> bool {
-        self.0.starts_with("ep-")
-    }
-    pub(crate) fn is_branch(&self) -> bool {
-        self.0.starts_with("br-")
-    }
-    // pub(crate) fn is_project(&self) -> bool {
-    //     !self.is_endpoint() && !self.is_branch()
-    // }
-    pub(crate) fn as_branch(&self) -> BranchId {
-        BranchId(self.0.clone())
-    }
-    pub(crate) fn as_project(&self) -> ProjectId {
-        ProjectId(self.0.clone())
-    }
-}
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 8e9663626a..659b7afa68 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
 };
 use crate::proxy::retry::{retry_after, should_retry, CouldRetry};
 use crate::proxy::wake_compute::wake_compute;
-use crate::Host;
+use crate::types::Host;
 
 const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
 
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index f646862caa..2970d93393 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -32,7 +32,8 @@ use crate::protocol2::read_proxy_protocol;
 use crate::proxy::handshake::{handshake, HandshakeData};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::stream::{PqStream, Stream};
-use crate::{auth, compute, EndpointCacheKey};
+use crate::types::EndpointCacheKey;
+use crate::{auth, compute};
 
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 3f54b0661b..fe62fee204 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -28,7 +28,8 @@ use crate::control_plane::provider::{
 };
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
-use crate::{sasl, scram, BranchId, EndpointId, ProjectId};
+use crate::types::{BranchId, EndpointId, ProjectId};
+use crate::{sasl, scram};
 
 /// Generate a set of TLS certificates: CA + server.
 fn generate_certs(
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 5de64c2254..4259fd04f4 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -250,7 +250,7 @@ mod tests {
     use super::{BucketRateLimiter, WakeComputeRateLimiter};
     use crate::intern::EndpointIdInt;
     use crate::rate_limiter::RateBucketInfo;
-    use crate::EndpointId;
+    use crate::types::EndpointId;
 
     #[test]
     fn rate_bucket_rpi() {
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index e56c5a3414..62e7b1b565 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -271,7 +271,7 @@ mod tests {
     use serde_json::json;
 
     use super::*;
-    use crate::{ProjectId, RoleName};
+    use crate::types::{ProjectId, RoleName};
 
     #[test]
     fn parse_allowed_ips() -> anyhow::Result<()> {
diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs
index 97644b6282..718445f61d 100644
--- a/proxy/src/scram/mod.rs
+++ b/proxy/src/scram/mod.rs
@@ -62,7 +62,7 @@ mod tests {
     use super::{Exchange, ServerSecret};
     use crate::intern::EndpointIdInt;
     use crate::sasl::{Mechanism, Step};
-    use crate::EndpointId;
+    use crate::types::EndpointId;
 
     #[test]
     fn snapshot() {
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index cc1b69fcf9..ebc6dd2a3c 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -189,7 +189,7 @@ impl Drop for JobHandle {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::EndpointId;
+    use crate::types::EndpointId;
 
     #[tokio::test]
     async fn hash_is_correct() {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 5d59b4d252..07e0e30148 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -18,6 +18,7 @@ use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCH
 use crate::auth::backend::local::StaticAuthRules;
 use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
 use crate::auth::{self, check_peer_addr_is_in_list, AuthError};
+use crate::compute;
 use crate::compute_ctl::{
     ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
 };
@@ -32,7 +33,7 @@ use crate::intern::EndpointIdInt;
 use crate::proxy::connect_compute::ConnectMechanism;
 use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
 use crate::rate_limiter::EndpointRateLimiter;
-use crate::{compute, EndpointId, Host};
+use crate::types::{EndpointId, Host};
 
 pub(crate) struct PoolingBackend {
     pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool<Send>>,
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 8401e3a1c9..7fa3357b5b 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -211,7 +211,7 @@ mod tests {
     use super::*;
     use crate::proxy::NeonOptions;
     use crate::serverless::cancel_set::CancelSet;
-    use crate::{BranchId, EndpointId, ProjectId};
+    use crate::types::{BranchId, EndpointId, ProjectId};
 
     struct MockClient(Arc<AtomicBool>);
     impl MockClient {
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index 844730194d..8830cddf0c 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -16,8 +16,8 @@ use crate::auth::backend::ComputeUserInfo;
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::ColdStartInfo;
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
+use crate::types::{DbName, EndpointCacheKey, RoleName};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
-use crate::{DbName, EndpointCacheKey, RoleName};
 
 #[derive(Debug, Clone)]
 pub(crate) struct ConnInfo {
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index 363e397976..934a50c14f 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -14,8 +14,8 @@ use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
+use crate::types::EndpointCacheKey;
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
-use crate::EndpointCacheKey;
 
 pub(crate) type Send = http2::SendRequest<hyper::body::Incoming>;
 pub(crate) type Connect =
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index e1ad46c751..064e7db7b3 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -35,8 +35,8 @@ use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
 use crate::context::RequestMonitoring;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
+use crate::types::{DbName, RoleName};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
-use crate::{DbName, RoleName};
 
 pub(crate) const EXT_NAME: &str = "pg_session_jwt";
 pub(crate) const EXT_VERSION: &str = "0.1.2";
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 6fbb044669..8e2d4c126a 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -38,8 +38,8 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::metrics::{HttpDirection, Metrics};
 use crate::proxy::{run_until_cancelled, NeonOptions};
 use crate::serverless::backend::HttpConnError;
+use crate::types::{DbName, RoleName};
 use crate::usage_metrics::{MetricCounter, MetricCounterRecorder};
-use crate::{DbName, RoleName};
 
 #[derive(serde::Deserialize)]
 #[serde(rename_all = "camelCase")]
diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs
new file mode 100644
index 0000000000..514a83d5eb
--- /dev/null
+++ b/proxy/src/signals.rs
@@ -0,0 +1,39 @@
+use std::convert::Infallible;
+
+use anyhow::bail;
+use tokio_util::sync::CancellationToken;
+use tracing::warn;
+
+/// Handle unix signals appropriately.
+pub async fn handle<F>(
+    token: CancellationToken,
+    mut refresh_config: F,
+) -> anyhow::Result<Infallible>
+where
+    F: FnMut(),
+{
+    use tokio::signal::unix::{signal, SignalKind};
+
+    let mut hangup = signal(SignalKind::hangup())?;
+    let mut interrupt = signal(SignalKind::interrupt())?;
+    let mut terminate = signal(SignalKind::terminate())?;
+
+    loop {
+        tokio::select! {
+            // Hangup is commonly used for config reload.
+            _ = hangup.recv() => {
+                warn!("received SIGHUP");
+                refresh_config();
+            }
+            // Shut down the whole application.
+            _ = interrupt.recv() => {
+                warn!("received SIGINT, exiting immediately");
+                bail!("interrupted");
+            }
+            _ = terminate.recv() => {
+                warn!("received SIGTERM, shutting down once all existing connections have closed");
+                token.cancel();
+            }
+        }
+    }
+}
diff --git a/proxy/src/types.rs b/proxy/src/types.rs
new file mode 100644
index 0000000000..b0408a51d1
--- /dev/null
+++ b/proxy/src/types.rs
@@ -0,0 +1,122 @@
+use crate::intern::{EndpointIdInt, EndpointIdTag, InternId};
+
+macro_rules! smol_str_wrapper {
+    ($name:ident) => {
+        #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
+        pub struct $name(smol_str::SmolStr);
+
+        impl $name {
+            #[allow(unused)]
+            pub(crate) fn as_str(&self) -> &str {
+                self.0.as_str()
+            }
+        }
+
+        impl std::fmt::Display for $name {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                self.0.fmt(f)
+            }
+        }
+
+        impl<T> std::cmp::PartialEq<T> for $name
+        where
+            smol_str::SmolStr: std::cmp::PartialEq<T>,
+        {
+            fn eq(&self, other: &T) -> bool {
+                self.0.eq(other)
+            }
+        }
+
+        impl<T> From<T> for $name
+        where
+            smol_str::SmolStr: From<T>,
+        {
+            fn from(x: T) -> Self {
+                Self(x.into())
+            }
+        }
+
+        impl AsRef<str> for $name {
+            fn as_ref(&self) -> &str {
+                self.0.as_ref()
+            }
+        }
+
+        impl std::ops::Deref for $name {
+            type Target = str;
+            fn deref(&self) -> &str {
+                &*self.0
+            }
+        }
+
+        impl<'de> serde::de::Deserialize<'de> for $name {
+            fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+                <smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
+            }
+        }
+
+        impl serde::Serialize for $name {
+            fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+                self.0.serialize(s)
+            }
+        }
+    };
+}
+
+const POOLER_SUFFIX: &str = "-pooler";
+
+impl EndpointId {
+    #[must_use]
+    pub fn normalize(&self) -> Self {
+        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
+            stripped.into()
+        } else {
+            self.clone()
+        }
+    }
+
+    #[must_use]
+    pub fn normalize_intern(&self) -> EndpointIdInt {
+        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
+            EndpointIdTag::get_interner().get_or_intern(stripped)
+        } else {
+            self.into()
+        }
+    }
+}
+
+// 90% of role name strings are 20 characters or less.
+smol_str_wrapper!(RoleName);
+// 50% of endpoint strings are 23 characters or less.
+smol_str_wrapper!(EndpointId);
+// 50% of branch strings are 23 characters or less.
+smol_str_wrapper!(BranchId);
+// 90% of project strings are 23 characters or less.
+smol_str_wrapper!(ProjectId);
+
+// will usually equal endpoint ID
+smol_str_wrapper!(EndpointCacheKey);
+
+smol_str_wrapper!(DbName);
+
+// postgres hostname, will likely be a port:ip addr
+smol_str_wrapper!(Host);
+
+// Endpoints are a bit tricky. Rare they might be branches or projects.
+impl EndpointId {
+    pub(crate) fn is_endpoint(&self) -> bool {
+        self.0.starts_with("ep-")
+    }
+    pub(crate) fn is_branch(&self) -> bool {
+        self.0.starts_with("br-")
+    }
+    // pub(crate) fn is_project(&self) -> bool {
+    //     !self.is_endpoint() && !self.is_branch()
+    // }
+    pub(crate) fn as_branch(&self) -> BranchId {
+        BranchId(self.0.clone())
+    }
+    pub(crate) fn as_project(&self) -> ProjectId {
+        ProjectId(self.0.clone())
+    }
+}
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index f944d5aec3..c5e8588623 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -497,7 +497,8 @@ mod tests {
     use url::Url;
 
     use super::*;
-    use crate::{http, BranchId, EndpointId};
+    use crate::http;
+    use crate::types::{BranchId, EndpointId};
 
     #[tokio::test]
     async fn metrics() {

From 0595320c87c4bcab9e346cf7904f7e4b00454388 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 23 Oct 2024 09:55:00 -0600
Subject: [PATCH 45/48] Protect call to pg_current_wal_lsn() in retained_wal
 query

We can't call pg_current_wal_lsn() if we are a standby instance (read
replica). Any attempt to call this function while in recovery results
in:

ERROR:  recovery is in progress

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute/etc/sql_exporter/retained_wal.sql | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql
index 6c58359461..3e2aadfc28 100644
--- a/compute/etc/sql_exporter/retained_wal.sql
+++ b/compute/etc/sql_exporter/retained_wal.sql
@@ -1,5 +1,10 @@
 SELECT
   slot_name,
-  pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+  pg_wal_lsn_diff(
+    CASE
+      WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
+      ELSE pg_current_wal_lsn()
+    END,
+    restart_lsn)::FLOAT8 AS retained_wal
 FROM pg_replication_slots
 WHERE active = false;

From e3ff87ce3bfa99662a9b2d299b78be1bfa35b8cd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 23 Oct 2024 17:29:55 +0100
Subject: [PATCH 46/48] tests: avoid using background_process when invoking
 pg_ctl (#9469)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

Occasionally, we get failures to start the storage controller's db with
errors like:
```
aborting due to panic at /__w/neon/neon/control_plane/src/background_process.rs:349:67:
claim pid file: lock file

Caused by:
    file is already locked
```
e.g.
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9428/11380574562/index.html#/testresult/1c68d413ea9ecd4a

This is happening in a stop/start cycle during a test. Presumably the
pidfile from the startup background process is still held at the point
we stop, because we let pg_ctl keep running in the background.

## Summary of changes

- Refactor pg_ctl invocations into a helper
- In the controller's `start` function, use pg_ctl & a wait loop for
pg_isready, instead of using background_process

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 control_plane/src/storage_controller.rs | 102 +++++++++++++++---------
 1 file changed, 65 insertions(+), 37 deletions(-)

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 43c63e7ef4..b70bd2e1b5 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -20,7 +20,16 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
+use std::{
+    ffi::OsStr,
+    fs,
+    net::SocketAddr,
+    path::PathBuf,
+    process::ExitStatus,
+    str::FromStr,
+    sync::OnceLock,
+    time::{Duration, Instant},
+};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -168,16 +177,6 @@ impl StorageController {
         .expect("non-Unicode path")
     }
 
-    /// PIDFile for the postgres instance used to store storage controller state
-    fn postgres_pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(
-            self.env
-                .base_data_dir
-                .join("storage_controller_postgres.pid"),
-        )
-        .expect("non-Unicode path")
-    }
-
     /// Find the directory containing postgres subdirectories, such `bin` and `lib`
     ///
     /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
@@ -296,6 +295,31 @@ impl StorageController {
             .map_err(anyhow::Error::new)
     }
 
+    /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres
+    async fn pg_ctl<I, S>(&self, args: I) -> ExitStatus
+    where
+        I: IntoIterator<Item = S>,
+        S: AsRef<OsStr>,
+    {
+        let pg_bin_dir = self.get_pg_bin_dir().await.unwrap();
+        let bin_path = pg_bin_dir.join("pg_ctl");
+
+        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
+        let envs = [
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
+
+        Command::new(bin_path)
+            .args(args)
+            .envs(envs)
+            .spawn()
+            .expect("Failed to spawn pg_ctl, binary_missing?")
+            .wait()
+            .await
+            .expect("Failed to wait for pg_ctl termination")
+    }
+
     pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
         let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
         if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
@@ -404,20 +428,34 @@ impl StorageController {
                 db_start_args
             );
 
-            background_process::start_process(
-                "storage_controller_db",
-                &self.env.base_data_dir,
-                pg_bin_dir.join("pg_ctl").as_std_path(),
-                db_start_args,
-                vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ],
-                background_process::InitialPidFile::Create(self.postgres_pid_file()),
-                &start_args.start_timeout,
-                || self.pg_isready(&pg_bin_dir, postgres_port),
-            )
-            .await?;
+            let db_start_status = self.pg_ctl(db_start_args).await;
+            let start_timeout: Duration = start_args.start_timeout.into();
+            let db_start_deadline = Instant::now() + start_timeout;
+            if !db_start_status.success() {
+                return Err(anyhow::anyhow!(
+                    "Failed to start postgres {}",
+                    db_start_status.code().unwrap()
+                ));
+            }
+
+            loop {
+                if Instant::now() > db_start_deadline {
+                    return Err(anyhow::anyhow!("Timed out waiting for postgres to start"));
+                }
+
+                match self.pg_isready(&pg_bin_dir, postgres_port).await {
+                    Ok(true) => {
+                        tracing::info!("storage controller postgres is now ready");
+                        break;
+                    }
+                    Ok(false) => {
+                        tokio::time::sleep(Duration::from_millis(100)).await;
+                    }
+                    Err(e) => {
+                        tracing::warn!("Failed to check postgres status: {e}")
+                    }
+                }
+            }
 
             self.setup_database(postgres_port).await?;
         }
@@ -583,15 +621,10 @@ impl StorageController {
         }
 
         let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
 
         println!("Stopping storage controller database...");
         let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
-        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_stop_args)
-            .spawn()?
-            .wait()
-            .await?;
+        let stop_status = self.pg_ctl(pg_stop_args).await;
         if !stop_status.success() {
             match self.is_postgres_running().await {
                 Ok(false) => {
@@ -612,14 +645,9 @@ impl StorageController {
 
     async fn is_postgres_running(&self) -> anyhow::Result<bool> {
         let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
 
         let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_status_args)
-            .spawn()?
-            .wait()
-            .await?;
+        let status_exitcode = self.pg_ctl(pg_status_args).await;
 
         // pg_ctl status returns this exit code if postgres is not running: in this case it is
         // fine that stop failed.  Otherwise it is an error that stop failed.

From ac1205c14c3fd655ad6b90b0f4fcd7d3a47938c3 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 23 Oct 2024 19:58:28 +0100
Subject: [PATCH 47/48] pageserver: add metric for number of zeroed pages on
 rel extend (#9492)

## Problem

Filling the gap in with zeroes is annoying for sharded ingest. We are
not sure it even happens in reality.

## Summary of Changes

Add one global counter which tracks how many such gap blocks we filled
on relation extends. We can add more metrics once we understand the
scope.
---
 pageserver/src/metrics.rs   | 6 ++++++
 pageserver/src/walingest.rs | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 3e824b59fb..8f697558d6 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2092,6 +2092,7 @@ pub(crate) struct WalIngestMetrics {
     pub(crate) records_received: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
+    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
 }
 
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2115,6 +2116,11 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
         "Number of WAL records filtered out due to sharding"
     )
     .expect("failed to define a metric"),
+    gap_blocks_zeroed_on_rel_extend: register_int_counter!(
+        "pageserver_gap_blocks_zeroed_on_rel_extend",
+        "Total number of zero gap blocks written on relation extends"
+    )
+    .expect("failed to define a metric"),
 });
 
 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 95d1f76920..d3e8bf59f2 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1915,7 +1915,9 @@ impl WalIngest {
             modification.put_rel_extend(rel, new_nblocks, ctx).await?;
 
             let mut key = rel_block_to_key(rel, blknum);
+
             // fill the gap with zeros
+            let mut gap_blocks_filled: u64 = 0;
             for gap_blknum in old_nblocks..blknum {
                 key.field6 = gap_blknum;
 
@@ -1924,7 +1926,12 @@ impl WalIngest {
                 }
 
                 modification.put_rel_page_image_zero(rel, gap_blknum)?;
+                gap_blocks_filled += 1;
             }
+
+            WAL_INGEST
+                .gap_blocks_zeroed_on_rel_extend
+                .inc_by(gap_blocks_filled);
         }
         Ok(())
     }

From b86432c29e63e61bbdeb110135101cdec7cfdb86 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 23 Oct 2024 21:52:22 -0600
Subject: [PATCH 48/48] Fix buggy sizeof

A sizeof on a pointer on a 64-bit machine is 8 bytes, whereas
Entry::old_name is a 64-byte array of characters. There was most likely
no fallout, since the first 8 bytes were still zeroed and the string
would therefore start with a NUL byte, but it is best to fix this
nonetheless.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 pgxn/neon/control_plane_connector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index 0730c305cb..4713103909 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -767,7 +767,7 @@ HandleDropRole(DropRoleStmt *stmt)
 		entry->type = Op_Delete;
 		entry->password = NULL;
 		if (!found)
-			memset(entry->old_name, 0, sizeof(entry));
+			memset(entry->old_name, 0, sizeof(entry->old_name));
 	}
 }