From 5bb9ba37cc83460c60cf26edeff027f48f17d7d3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 14 Dec 2023 17:19:36 +0300 Subject: [PATCH 01/19] Fix python list_segments of sk. Fixes rare test_peer_recovery flakiness as we started to compare tmp control file. https://neondb.slack.com/archives/C04KGFVUWUQ/p1702310929657179 --- test_runner/fixtures/neon_fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7dfdd9274d..42e122cefe 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2945,7 +2945,7 @@ class Safekeeper: tli_dir = self.timeline_dir(tenant_id, timeline_id) segments = [] for _, _, filenames in os.walk(tli_dir): - segments.extend([f for f in filenames if f != "safekeeper.control"]) + segments.extend([f for f in filenames if not f.startswith("safekeeper.control")]) segments.sort() return segments From 07508fb1108d7fb503471a433395838fd13567ed Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Dec 2023 12:18:22 +0200 Subject: [PATCH 02/19] fix: better Json parsing errors (#6135) Before any json parsing from the http api only returned errors were per field errors. Now they are done using `serde_path_to_error`, which at least helped greatly with the `disk_usage_eviction_run` used for testing. I don't think this can conflict with anything added in #5310. --- Cargo.lock | 1 + libs/utils/Cargo.toml | 2 ++ libs/utils/src/http/json.rs | 8 ++++++-- pageserver/src/http/routes.rs | 4 +--- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71c2c976a7..7bf9209395 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5765,6 +5765,7 @@ dependencies = [ "serde", "serde_assert", "serde_json", + "serde_path_to_error", "serde_with", "signal-hook", "strum", diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ccf6f4f2d7..af0414daa2 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -50,6 +50,8 @@ const_format.workspace = true # why is it only here? no other crate should use it, streams are rarely needed. 
tokio-stream = { version = "0.1.14" } +serde_path_to_error.workspace = true + [dev-dependencies] byteorder.workspace = true bytes.workspace = true diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 70e682cb76..7ca62561fe 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -25,8 +25,12 @@ pub async fn json_request_or_empty_body Deserialize<'de>>( if body.remaining() == 0 { return Ok(None); } - serde_json::from_reader(body.reader()) - .context("Failed to parse json request") + + let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); + + serde_path_to_error::deserialize(&mut deser) + // intentionally stringify because the debug version is not helpful in python logs + .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) .map(Some) .map_err(ApiError::BadRequest) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 14107557d9..102c9d9a6a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1621,9 +1621,7 @@ async fn disk_usage_eviction_run( } } - let config = json_request::(&mut r) - .await - .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + let config = json_request::(&mut r).await?; let usage = Usage { config, From 0fd80484a926ff27275af14c5199d7364b668e56 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Dec 2023 13:59:24 +0200 Subject: [PATCH 03/19] fix: Timeline deletion during busy startup (#6133) Compaction was holding back timeline deletion because the compaction lock had been acquired, but the semaphore was waited on. Timeline deletion was waiting on the same lock for 1500s. This replaces the `pageserver::tenant::tasks::concurrent_background_tasks_rate_limit` (which looks correct) with a simpler `..._permit` which is just an infallible acquire, which is easier to spot "aah this needs to be raced with cancellation tokens". Ref: https://neondb.slack.com/archives/C03F5SM1N02/p1702496912904719 Ref: https://neondb.slack.com/archives/C03F5SM1N02/p1702578093497779 --- pageserver/src/task_mgr.rs | 11 +++- pageserver/src/tenant/tasks.rs | 23 ++------ pageserver/src/tenant/timeline.rs | 57 ++++++++++++------- .../src/tenant/timeline/eviction_task.rs | 16 +++--- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index b80a498c82..cb1b2b8011 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -561,9 +561,14 @@ pub async fn shutdown_watcher() { /// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or /// `tokio::task::JoinSet::spawn`. pub fn shutdown_token() -> CancellationToken { - SHUTDOWN_TOKEN - .try_with(|t| t.clone()) - .expect("shutdown_token() called in an unexpected task or thread") + let res = SHUTDOWN_TOKEN.try_with(|t| t.clone()); + + if cfg!(test) { + // in tests this method is called from non-taskmgr spawned tasks, and that is all ok. + res.unwrap_or_default() + } else { + res.expect("shutdown_token() called in an unexpected task or thread") + } } /// Has the current task been requested to shut down? diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 4b118442f4..7ff1873eda 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -54,29 +54,18 @@ impl BackgroundLoopKind { } } -pub(crate) enum RateLimitError { - Cancelled, -} - -pub(crate) async fn concurrent_background_tasks_rate_limit( +/// Cancellation safe. 
+pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, - cancel: &CancellationToken, -) -> Result { +) -> impl Drop { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) .guard(); - tokio::select! { - permit = CONCURRENT_BACKGROUND_TASKS.acquire() => { - match permit { - Ok(permit) => Ok(permit), - Err(_closed) => unreachable!("we never close the semaphore"), - } - }, - _ = cancel.cancelled() => { - Err(RateLimitError::Cancelled) - } + match CONCURRENT_BACKGROUND_TASKS.acquire().await { + Ok(permit) => permit, + Err(_closed) => unreachable!("we never close the semaphore"), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ac1922ccad..7438215a68 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -51,7 +51,7 @@ use crate::tenant::storage_layer::{ LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, ValueReconstructState, }; -use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError}; +use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -715,19 +715,27 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<(), CompactionError> { - let _g = self.compaction_lock.lock().await; + // most likely the cancellation token is from background task, but in tests it could be the + // request task as well. + + let prepare = async move { + let guard = self.compaction_lock.lock().await; + + let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( + BackgroundLoopKind::Compaction, + ctx, + ) + .await; + + (guard, permit) + }; // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. - let _permit = match super::tasks::concurrent_background_tasks_rate_limit( - BackgroundLoopKind::Compaction, - ctx, - cancel, - ) - .await - { - Ok(permit) => permit, - Err(RateLimitError::Cancelled) => return Ok(()), + let (_guard, _permit) = tokio::select! { + tuple = prepare => { tuple }, + _ = self.cancel.cancelled() => return Ok(()), + _ = cancel.cancelled() => return Ok(()), }; let last_record_lsn = self.get_last_record_lsn(); @@ -1782,22 +1790,22 @@ impl Timeline { let skip_concurrency_limiter = &skip_concurrency_limiter; async move { let cancel = task_mgr::shutdown_token(); - let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit( + let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, - &cancel, ); use crate::metrics::initial_logical_size::StartCircumstances; let (_maybe_permit, circumstances) = tokio::select! 
{ - res = wait_for_permit => { - match res { - Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit), - Err(RateLimitError::Cancelled) => { - return Err(BackgroundCalculationError::Cancelled); - } - } + permit = wait_for_permit => { + (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) } + _ = self_ref.cancel.cancelled() => { + return Err(BackgroundCalculationError::Cancelled); + } + _ = cancel.cancelled() => { + return Err(BackgroundCalculationError::Cancelled); + }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size // => break out of the rate limit @@ -3852,7 +3860,14 @@ impl Timeline { /// within a layer file. We can only remove the whole file if it's fully /// obsolete. pub(super) async fn gc(&self) -> anyhow::Result { - let _g = self.gc_lock.lock().await; + // this is most likely the background tasks, but it might be the spawned task from + // immediate_gc + let cancel = crate::task_mgr::shutdown_token(); + let _g = tokio::select! { + guard = self.gc_lock.lock() => guard, + _ = self.cancel.cancelled() => return Ok(GcResult::default()), + _ = cancel.cancelled() => return Ok(GcResult::default()), + }; let timer = self.metrics.garbage_collect_histo.start_timer(); fail_point!("before-timeline-gc"); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 782e8f9e39..ea5f5f5fa7 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,7 +30,7 @@ use crate::{ task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, - tasks::{BackgroundLoopKind, RateLimitError}, + tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -158,15 +158,15 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit( + let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, - cancel, - ) - .await - { - Ok(permit) => permit, - Err(RateLimitError::Cancelled) => return ControlFlow::Break(()), + ); + + let _permit = tokio::select! { + permit = acquire_permit => permit, + _ = cancel.cancelled() => return ControlFlow::Break(()), + _ = self.cancel.cancelled() => return ControlFlow::Break(()), }; // If we evict layers but keep cached values derived from those layers, then From 215cdd18c4032adf915332252fcd4a685b9ca406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 15 Dec 2023 13:11:25 +0100 Subject: [PATCH 04/19] Make initdb upload retries cancellable and seek to beginning (#6147) * initdb uploads had no cancellation token, which means that when we were stuck in upload retries, we wouldn't be able to delete the timeline. in general, the combination of retrying forever and not having cancellation tokens is quite dangerous. * initdb uploads wouldn't rewind the file. this wasn't discovered in the purposefully unreliable test-s3 in pytest because those fail on the first byte always, not somewhere during the connection. we'd be getting errors from the AWS sdk that the file was at an unexpected end. 
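For illustration of the rewind-before-retry point above, a minimal sketch of the pattern, with a hypothetical try_upload helper standing in for the real remote-storage call (this is not the code from this patch; the actual change is in the diff below):

    use std::io::SeekFrom;
    use tokio::fs::File;
    use tokio::io::{AsyncReadExt, AsyncSeekExt};
    use tokio_util::sync::CancellationToken;

    // Stand-in for the real remote-storage upload; it consumes the reader,
    // which is exactly why a retry must rewind the file first.
    async fn try_upload(file: &mut File) -> anyhow::Result<()> {
        let mut buf = Vec::new();
        file.read_to_end(&mut buf).await?;
        Ok(())
    }

    async fn upload_with_retries(mut file: File, cancel: CancellationToken) -> anyhow::Result<()> {
        loop {
            // A previous attempt may have read part of the file already,
            // so start every attempt from byte 0.
            file.seek(SeekFrom::Start(0)).await?;
            tokio::select! {
                res = try_upload(&mut file) => match res {
                    Ok(()) => return Ok(()),
                    Err(e) => tracing::warn!("upload failed, will retry: {:#}", e),
                },
                // Without a cancellation arm, endless retries would block
                // timeline deletion, as described above.
                _ = cancel.cancelled() => anyhow::bail!("upload cancelled"),
            }
        }
    }

A real retry loop would also apply backoff and an attempt limit; the sketch only shows the two properties this patch adds, rewinding and cancellability.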
slack thread: https://neondb.slack.com/archives/C033RQ5SPDH/p1702632247784079 --- pageserver/src/tenant.rs | 5 +++-- pageserver/src/tenant/remote_timeline_client/upload.rs | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 969210622c..ace20e44e2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3106,8 +3106,9 @@ impl Tenant { 3, u32::MAX, "persist_initdb_tar_zst", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(self.cancel.clone(), || { + anyhow::anyhow!("initdb upload cancelled") + }), ) .await?; diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index d0744e7c83..9f5b3c242b 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -4,8 +4,9 @@ use anyhow::{bail, Context}; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::ErrorKind; +use std::io::{ErrorKind, SeekFrom}; use tokio::fs::{self, File}; +use tokio::io::AsyncSeekExt; use super::Generation; use crate::{ @@ -119,11 +120,14 @@ pub(crate) async fn upload_initdb_dir( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, - initdb_tar_zst: File, + mut initdb_tar_zst: File, size: u64, ) -> anyhow::Result<()> { tracing::trace!("uploading initdb dir"); + // We might have read somewhat into the file already in the prior retry attempt + initdb_tar_zst.seek(SeekFrom::Start(0)).await?; + let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); From 98629841e093eb059e6f75286cba98b4b93bcf6a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 15 Dec 2023 12:11:50 +0000 Subject: [PATCH 05/19] improve proxy code cov (#6141) ## Summary of changes saw some low-hanging codecov improvements. even if code coverage is somewhat of a pointless game, might as well add tests where we can and delete code if it's unused --- proxy/src/rate_limiter/aimd.rs | 33 -------------- proxy/src/rate_limiter/limiter.rs | 69 +++++++++++++++++++----------- proxy/src/serverless/websocket.rs | 71 ++++++++++++++++++++++++++++--- 3 files changed, 107 insertions(+), 66 deletions(-) diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs index c6c532ae53..2c14a54a6c 100644 --- a/proxy/src/rate_limiter/aimd.rs +++ b/proxy/src/rate_limiter/aimd.rs @@ -33,39 +33,6 @@ impl Aimd { min_utilisation_threshold: config.aimd_min_utilisation_threshold, } } - - pub fn decrease_factor(self, factor: f32) -> Self { - assert!((0.5..1.0).contains(&factor)); - Self { - decrease_factor: factor, - ..self - } - } - - pub fn increase_by(self, increase: usize) -> Self { - assert!(increase > 0); - Self { - increase_by: increase, - ..self - } - } - - pub fn with_max_limit(self, max: usize) -> Self { - assert!(max > 0); - Self { - max_limit: max, - ..self - } - } - - /// A threshold below which the limit won't be increased. 0.5 = 50%. - pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self { - assert!(min_util > 0. 
&& min_util < 1.); - Self { - min_utilisation_threshold: min_util, - ..self - } - } } #[async_trait] diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 87c1597ca9..8dfdfcd3db 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,12 +1,16 @@ -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, +use std::{ + collections::hash_map::RandomState, + hash::BuildHasher, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, }; use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; -use rand::{thread_rng, Rng}; +use rand::{rngs::StdRng, Rng, SeedableRng}; use smol_str::SmolStr; use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Duration, Instant}; @@ -28,10 +32,11 @@ use super::{ // saw SNI, before doing TLS handshake. User-side error messages in that case // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now // I went with a more expensive way that yields user-friendlier error messages. -pub struct EndpointRateLimiter { - map: DashMap>, +pub struct EndpointRateLimiter { + map: DashMap, Hasher>, info: &'static [RateBucketInfo], access_count: AtomicUsize, + rand: Mutex, } #[derive(Clone, Copy)] @@ -125,11 +130,18 @@ impl RateBucketInfo { impl EndpointRateLimiter { pub fn new(info: &'static [RateBucketInfo]) -> Self { + Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new()) + } +} + +impl EndpointRateLimiter { + fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self { info!(buckets = ?info, "endpoint rate limiter"); Self { info, - map: DashMap::with_shard_amount(64), + map: DashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request + rand: Mutex::new(rand), } } @@ -176,7 +188,9 @@ impl EndpointRateLimiter { self.map.len() ); let n = self.map.shards().len(); - let shard = thread_rng().gen_range(0..n); + // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide + // (impossible, infact, unless we have 2048 threads) + let shard = self.rand.lock().unwrap().gen_range(0..n); self.map.shards()[shard].write().clear(); } } @@ -219,7 +233,6 @@ pub struct Token<'t> { #[derive(Debug, Clone, Copy)] pub struct LimiterState { limit: usize, - available: usize, in_flight: usize, } @@ -397,11 +410,7 @@ impl Limiter { pub fn state(&self) -> LimiterState { let limit = self.limits.load(Ordering::Relaxed); let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { - limit, - available: limit.saturating_sub(in_flight), - in_flight, - } + LimiterState { limit, in_flight } } } @@ -414,13 +423,6 @@ impl<'t> Token<'t> { } } - #[cfg(test)] - pub fn set_latency(&mut self, latency: Duration) { - use std::ops::Sub; - - self.start = Instant::now().sub(latency); - } - pub fn forget(&mut self) { if let Some(permit) = self.permit.take() { permit.forget(); @@ -439,10 +441,6 @@ impl LimiterState { pub fn limit(&self) -> usize { self.limit } - /// The amount of concurrency available to use. - pub fn available(&self) -> usize { - self.available - } /// The number of jobs in flight. 
pub fn in_flight(&self) -> usize { self.in_flight @@ -490,9 +488,11 @@ impl reqwest_middleware::Middleware for Limiter { #[cfg(test)] mod tests { - use std::{pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; use futures::{task::noop_waker_ref, Future}; + use rand::SeedableRng; + use rustc_hash::FxHasher; use smol_str::SmolStr; use tokio::time; @@ -690,4 +690,21 @@ mod tests { assert!(limiter.check(endpoint.clone())); } } + + #[tokio::test] + async fn test_rate_limits_gc() { + // fixed seeded random/hasher to ensure that the test is not flaky + let rand = rand::rngs::StdRng::from_seed([1; 32]); + let hasher = BuildHasherDefault::::default(); + + let limiter = EndpointRateLimiter::new_with_rand_and_hasher( + &RateBucketInfo::DEFAULT_SET, + rand, + hasher, + ); + for i in 0..1_000_000 { + limiter.check(format!("{i}").into()); + } + assert!(limiter.map.len() < 150_000); + } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index cd6184cdee..071add3bca 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -27,15 +27,15 @@ use sync_wrapper::SyncWrapper; pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. - pub struct WebSocketRw { + pub struct WebSocketRw { #[pin] - stream: SyncWrapper>, + stream: SyncWrapper>, bytes: Bytes, } } -impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { +impl WebSocketRw { + pub fn new(stream: WebSocketStream) -> Self { Self { stream: stream.into(), bytes: Bytes::new(), @@ -43,7 +43,7 @@ impl WebSocketRw { } } -impl AsyncWrite for WebSocketRw { +impl AsyncWrite for WebSocketRw { fn poll_write( self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -69,7 +69,7 @@ impl AsyncWrite for WebSocketRw { } } -impl AsyncRead for WebSocketRw { +impl AsyncRead for WebSocketRw { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -86,7 +86,7 @@ impl AsyncRead for WebSocketRw { } } -impl AsyncBufRead for WebSocketRw { +impl AsyncBufRead for WebSocketRw { fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { // Please refer to poll_fill_buf's documentation. 
const EOF: Poll> = Poll::Ready(Ok(&[])); @@ -151,3 +151,60 @@ pub async fn serve_websocket( .await?; Ok(()) } + +#[cfg(test)] +mod tests { + use std::pin::pin; + + use futures::{SinkExt, StreamExt}; + use hyper_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; + use tokio::{ + io::{duplex, AsyncReadExt, AsyncWriteExt}, + task::JoinSet, + }; + + use super::WebSocketRw; + + #[tokio::test] + async fn websocket_stream_wrapper_happy_path() { + let (stream1, stream2) = duplex(1024); + + let mut js = JoinSet::new(); + + js.spawn(async move { + let mut client = WebSocketStream::from_raw_socket(stream1, Role::Client, None).await; + + client + .send(Message::Binary(b"hello world".to_vec())) + .await + .unwrap(); + + let message = client.next().await.unwrap().unwrap(); + assert_eq!(message, Message::Binary(b"websockets are cool".to_vec())); + + client.close(None).await.unwrap(); + }); + + js.spawn(async move { + let mut rw = pin!(WebSocketRw::new( + WebSocketStream::from_raw_socket(stream2, Role::Server, None).await + )); + + let mut buf = vec![0; 1024]; + let n = rw.read(&mut buf).await.unwrap(); + assert_eq!(&buf[..n], b"hello world"); + + rw.write_all(b"websockets are cool").await.unwrap(); + rw.flush().await.unwrap(); + + let n = rw.read_to_end(&mut buf).await.unwrap(); + assert_eq!(n, 0); + }); + + js.join_next().await.unwrap().unwrap(); + js.join_next().await.unwrap().unwrap(); + } +} From bd1cb1b217f39c8d109ccca9c285bc1da959620e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Dec 2023 15:08:28 +0000 Subject: [PATCH 06/19] tests: update allow list for `negative_env` (#6144) Tests attaching the tenant immediately after the fixture detaches it could result in LSN updates failing validation e.g. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6142/7211196140/index.html#suites/7745dadbd815ab87f5798aa881796f47/32b12ccc0b01b122 --- test_runner/regress/test_attach_tenant_config.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index d2bd53d8aa..352ec13884 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -52,7 +52,16 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N TenantId(t["id"]) for t in ps_http.tenant_list() ], "tenant should not be attached after negative test" - env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request") + env.pageserver.allowed_errors.extend( + [ + # This fixture detaches the tenant, and tests using it will tend to re-attach it + # shortly after. There may be un-processed deletion_queue validations from the + # initial attachment + ".*Dropped remote consistent LSN updates.*", + # This fixture is for tests that will intentionally generate 400 responses + ".*Error processing HTTP request: Bad request", + ] + ) def log_contains_bad_request(): env.pageserver.log_contains(".*Error processing HTTP request: Bad request") From e62569a8788322d04d9eb397c1c35a670df37fdb Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 12 Dec 2023 00:22:23 +0300 Subject: [PATCH 07/19] A few comments on rust walproposer build. 
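For context, the bindgen step that the new module comment below refers to follows the standard Rust build-script pattern. A minimal generic sketch, with an illustrative header path rather than this crate's actual build.rs contents:

    // build.rs sketch: generate Rust bindings from a C header at build time.
    use std::{env, path::PathBuf};

    fn main() {
        // Re-run the build script when the wrapper header changes (path is illustrative).
        println!("cargo:rerun-if-changed=bindgen_deps.h");

        let bindings = bindgen::Builder::default()
            .header("bindgen_deps.h")
            .generate()
            .expect("failed to generate bindings");

        let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
        bindings
            .write_to_file(out_path.join("bindings.rs"))
            .expect("failed to write bindings.rs");
    }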
--- libs/walproposer/build.rs | 3 +++ libs/walproposer/src/api_bindings.rs | 3 +++ 2 files changed, 6 insertions(+) diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index d32c8ab299..fd09030dbd 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -1,3 +1,6 @@ +//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h +//! to generate Rust bindings for it. + use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 7f1bbc3b80..77afe1e686 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -1,3 +1,6 @@ +//! A C-Rust shim: defines implementation of C walproposer API, assuming wp +//! callback_data stores Box to some Rust implementation. + #![allow(dead_code)] use std::ffi::CStr; From de1a9c6e3b5d40df8c36a0a77f9d2cfad1c3e35b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Dec 2023 15:48:55 +0000 Subject: [PATCH 08/19] s3_scrubber: basic support for sharding (#6119) This doesn't make the scrubber smart enough to understand that many shards are part of the same tenants, but it makes it understand paths well enough to scrub the individual shards without thinking they're malformed. This is a prerequisite to being able to run tests with sharding enabled. Related: #5929 --- Cargo.lock | 1 + s3_scrubber/Cargo.toml | 1 + s3_scrubber/src/checks.rs | 9 +++---- s3_scrubber/src/garbage.rs | 38 +++++++++++++++------------- s3_scrubber/src/lib.rs | 40 ++++++++++++++++++++++++++---- s3_scrubber/src/metadata_stream.rs | 15 +++++------ s3_scrubber/src/scan_metadata.rs | 18 ++++++-------- 7 files changed, 78 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7bf9209395..08e6961408 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4292,6 +4292,7 @@ dependencies = [ "histogram", "itertools", "pageserver", + "pageserver_api", "rand 0.8.5", "remote_storage", "reqwest", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index e26f2c6d6b..fdae378d55 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -31,6 +31,7 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls" aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] } pageserver = { path = "../pageserver" } +pageserver_api = { path = "../libs/pageserver_api" } remote_storage = { path = "../libs/remote_storage" } tracing.workspace = true diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index a15a908212..2acbb2352b 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -7,13 +7,12 @@ use utils::generation::Generation; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget}; +use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::{pin_mut, StreamExt}; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; -use utils::id::TenantTimelineId; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -39,8 +38,8 @@ impl TimelineAnalysis { } } -pub(crate) async fn branch_cleanup_and_check_errors( - id: &TenantTimelineId, +pub(crate) fn branch_cleanup_and_check_errors( + id: &TenantShardTimelineId, s3_root: &RootTarget, s3_active_branch: 
Option<&BranchData>, console_branch: Option, @@ -238,7 +237,7 @@ fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), St pub(crate) async fn list_timeline_blobs( s3_client: &Client, - id: TenantTimelineId, + id: TenantShardTimelineId, s3_root: &RootTarget, ) -> anyhow::Result { let mut s3_layers = HashSet::new(); diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index f27e1d7f65..7192afb91b 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -10,15 +10,16 @@ use aws_sdk_s3::{ Client, }; use futures_util::{pin_mut, TryStreamExt}; +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; -use utils::id::{TenantId, TenantTimelineId}; +use utils::id::TenantId; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth, }; #[derive(Serialize, Deserialize, Debug)] @@ -29,8 +30,8 @@ enum GarbageReason { #[derive(Serialize, Deserialize, Debug)] enum GarbageEntity { - Tenant(TenantId), - Timeline(TenantTimelineId), + Tenant(TenantShardId), + Timeline(TenantShardTimelineId), } #[derive(Serialize, Deserialize, Debug)] @@ -142,6 +143,9 @@ async fn find_garbage_inner( console_projects.len() ); + // TODO(sharding): batch calls into Console so that we only call once for each TenantId, + // rather than checking the same TenantId for multiple TenantShardId + // Enumerate Tenants in S3, and check if each one exists in Console tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket); let tenants = stream_tenants(&s3_client, &target); @@ -149,10 +153,10 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); let console_projects = &console_projects; async move { - match console_projects.get(&t) { + match console_projects.get(&t.tenant_id) { Some(project_data) => Ok((t, Some(project_data.clone()))), None => api_client - .find_tenant_project(t) + .find_tenant_project(t.tenant_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (t, r)), @@ -166,21 +170,21 @@ async fn find_garbage_inner( // checks if they are enabled by the `depth` parameter. 
pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); - let mut active_tenants: Vec = vec![]; + let mut active_tenants: Vec = vec![]; let mut counter = 0; while let Some(result) = tenants_checked.next().await { - let (tenant_id, console_result) = result?; + let (tenant_shard_id, console_result) = result?; // Paranoia check if let Some(project) = &console_result { - assert!(project.tenant == tenant_id); + assert!(project.tenant == tenant_shard_id.tenant_id); } - if garbage.maybe_append(GarbageEntity::Tenant(tenant_id), console_result) { - tracing::debug!("Tenant {tenant_id} is garbage"); + if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) { + tracing::debug!("Tenant {tenant_shard_id} is garbage"); } else { - tracing::debug!("Tenant {tenant_id} is active"); - active_tenants.push(tenant_id); + tracing::debug!("Tenant {tenant_shard_id} is active"); + active_tenants.push(tenant_shard_id); } counter += 1; @@ -266,13 +270,13 @@ impl std::fmt::Display for PurgeMode { pub async fn get_tenant_objects( s3_client: &Arc, target: RootTarget, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, ) -> anyhow::Result> { - tracing::debug!("Listing objects in tenant {tenant_id}"); + tracing::debug!("Listing objects in tenant {tenant_shard_id}"); // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let mut tenant_root = target.tenant_root(&tenant_id); + let mut tenant_root = target.tenant_root(&tenant_shard_id); // Remove delimiter, so that object listing lists all keys in the prefix and not just // common prefixes. @@ -285,7 +289,7 @@ pub async fn get_tenant_objects( pub async fn get_timeline_objects( s3_client: &Arc, target: RootTarget, - ttid: TenantTimelineId, + ttid: TenantShardTimelineId, ) -> anyhow::Result> { tracing::debug!("Listing objects in timeline {ttid}"); let mut timeline_root = target.timeline_root(&ttid); diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 6607db21e6..d2338c21e5 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -22,6 +22,7 @@ use aws_sdk_s3::{Client, Config}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; use std::io::IsTerminal; @@ -29,7 +30,7 @@ use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::{TenantId, TenantTimelineId}; +use utils::id::TimelineId; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -44,6 +45,35 @@ pub struct S3Target { pub delimiter: String, } +/// Convenience for referring to timelines within a particular shard: more ergonomic +/// than using a 2-tuple. +/// +/// This is the shard-aware equivalent of TenantTimelineId. It's defined here rather +/// than somewhere more broadly exposed, because this kind of thing is rarely needed +/// in the pageserver, as all timeline objects existing in the scope of a particular +/// tenant: the scrubber is different in that it handles collections of data referring to many +/// TenantShardTimelineIds in on place. 
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct TenantShardTimelineId { + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, +} + +impl TenantShardTimelineId { + fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self { + Self { + tenant_shard_id, + timeline_id, + } + } +} + +impl Display for TenantShardTimelineId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id) + } +} + #[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] pub enum TraversingDepth { Tenant, @@ -110,19 +140,19 @@ impl RootTarget { } } - pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target { + pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { self.tenants_root().with_sub_segment(&tenant_id.to_string()) } - pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target { + pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { match self { Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), Self::Safekeeper(_) => self.tenant_root(tenant_id), } } - pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target { - self.timelines_root(&id.tenant_id) + pub fn timeline_root(&self, id: &TenantShardTimelineId) -> S3Target { + self.timelines_root(&id.tenant_shard_id) .with_sub_segment(&id.timeline_id.to_string()) } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 4cfa77cfc1..073f37f319 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -3,14 +3,15 @@ use async_stream::{stream, try_stream}; use aws_sdk_s3::{types::ObjectIdentifier, Client}; use tokio_stream::Stream; -use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId}; -use utils::id::{TenantTimelineId, TimelineId}; +use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; +use pageserver_api::shard::TenantShardId; +use utils::id::TimelineId; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( s3_client: &'a Client, target: &'a RootTarget, -) -> impl Stream> + 'a { +) -> impl Stream> + 'a { try_stream! { let mut continuation_token = None; let tenants_target = target.tenants_root(); @@ -44,14 +45,14 @@ pub fn stream_tenants<'a>( } } -/// Given a TenantId, output a stream of the timelines within that tenant, discovered +/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. pub async fn stream_tenant_timelines<'a>( s3_client: &'a Client, target: &'a RootTarget, - tenant: TenantId, -) -> anyhow::Result> + 'a> { + tenant: TenantShardId, +) -> anyhow::Result> + 'a> { let mut timeline_ids: Vec> = Vec::new(); let mut continuation_token = None; let timelines_target = target.timelines_root(&tenant); @@ -98,7 +99,7 @@ pub async fn stream_tenant_timelines<'a>( Ok(stream! 
{ for i in timeline_ids { let id = i?; - yield Ok(TenantTimelineId::new(tenant, id)); + yield Ok(TenantShardTimelineId::new(tenant, id)); } }) } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 228f8d6763..91347ca21b 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -5,20 +5,19 @@ use crate::checks::{ TimelineAnalysis, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{pin_mut, StreamExt, TryStreamExt}; use histogram::Histogram; use pageserver::tenant::IndexPart; use serde::Serialize; -use utils::id::TenantTimelineId; #[derive(Serialize)] pub struct MetadataSummary { count: usize, - with_errors: HashSet, - with_warnings: HashSet, - with_garbage: HashSet, + with_errors: HashSet, + with_warnings: HashSet, + with_garbage: HashSet, indices_by_version: HashMap, layer_count: MinMaxHisto, @@ -132,7 +131,7 @@ impl MetadataSummary { } } - fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) { + fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { if !analysis.errors.is_empty() { self.with_errors.insert(*id); } @@ -199,8 +198,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> { + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { let data = list_timeline_blobs(s3_client, ttid, target).await?; Ok((ttid, data)) } @@ -213,8 +212,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result Date: Fri, 15 Dec 2023 18:33:45 +0100 Subject: [PATCH 09/19] add a Rust client for Pageserver management API (#6127) Part of getpage@lsn benchmark epic: https://github.com/neondatabase/neon/issues/5771 This PR moves the control plane's spread-all-over-the-place client for the pageserver management API into a separate module within the pageserver crate. I need that client to be async in my benchmarking work, so, this PR switches to the async version of `reqwest`. That is also the right direction generally IMO. The switch to async in turn mandated converting most of the `control_plane/` code to async. Note that some of the client methods should be taking `TenantShardId` instead of `TenantId`, but, none of the callers seem to be sharding-aware. 
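To make the blocking-to-async switch concrete, this is roughly what a management-API call looks like with async reqwest; the struct, method name, and route handling here are illustrative, not the actual pageserver_client::mgmt_api surface:

    use serde::de::DeserializeOwned;

    pub struct Client {
        mgmt_api_endpoint: String,
        http: reqwest::Client,
    }

    impl Client {
        // Generic GET helper: awaitable end to end, so callers must be async,
        // which is what pushed most of control_plane/ to become async as well.
        pub async fn get_json<T: DeserializeOwned>(&self, path: &str) -> anyhow::Result<T> {
            let url = format!("{}/{}", self.mgmt_api_endpoint, path.trim_start_matches('/'));
            let resp = self.http.get(url).send().await?;
            // Surface non-2xx statuses as errors instead of trying to parse the body.
            let resp = resp.error_for_status()?;
            Ok(resp.json::<T>().await?)
        }
    }

Because every call is awaited, neon_local builds a small current-thread tokio runtime in main() and block_on()s each subcommand, as the diff below shows.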
Leaving that for another time: https://github.com/neondatabase/neon/issues/6154 --- Cargo.lock | 18 ++ Cargo.toml | 2 + control_plane/Cargo.toml | 7 +- control_plane/src/attachment_service.rs | 23 +- control_plane/src/background_process.rs | 35 ++- control_plane/src/bin/neon_local.rs | 163 ++++++++------ control_plane/src/broker.rs | 11 +- control_plane/src/endpoint.rs | 24 +- control_plane/src/pageserver.rs | 282 +++++++++--------------- control_plane/src/safekeeper.rs | 40 ++-- control_plane/src/tenant_migration.rs | 42 ++-- libs/postgres_connection/src/lib.rs | 14 +- pageserver/client/Cargo.toml | 14 ++ pageserver/client/src/lib.rs | 1 + pageserver/client/src/mgmt_api.rs | 172 +++++++++++++++ 15 files changed, 529 insertions(+), 319 deletions(-) create mode 100644 pageserver/client/Cargo.toml create mode 100644 pageserver/client/src/lib.rs create mode 100644 pageserver/client/src/mgmt_api.rs diff --git a/Cargo.lock b/Cargo.lock index 08e6961408..99a936dd40 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1245,16 +1245,19 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "camino", "clap", "comfy-table", "compute_api", + "futures", "git-version", "hex", "hyper", "nix 0.26.2", "once_cell", "pageserver_api", + "pageserver_client", "postgres", "postgres_backend", "postgres_connection", @@ -1268,6 +1271,8 @@ dependencies = [ "tar", "thiserror", "tokio", + "tokio-postgres", + "tokio-util", "toml", "tracing", "url", @@ -3172,6 +3177,19 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_client" +version = "0.1.0" +dependencies = [ + "async-trait", + "pageserver_api", + "reqwest", + "serde", + "thiserror", + "utils", + "workspace_hack", +] + [[package]] name = "parking" version = "2.1.1" diff --git a/Cargo.toml b/Cargo.toml index 496a9d7839..b44544d626 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "control_plane", "pageserver", "pageserver/ctl", + "pageserver/client", "proxy", "safekeeper", "storage_broker", @@ -182,6 +183,7 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } +pageserver_client = { path = "./pageserver/client" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 7ccddc161e..898ad05add 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,9 +6,11 @@ license.workspace = true [dependencies] anyhow.workspace = true +async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true +futures.workspace = true git-version.workspace = true nix.workspace = true once_cell.workspace = true @@ -24,10 +26,11 @@ tar.workspace = true thiserror.workspace = true toml.workspace = true tokio.workspace = true +tokio-postgres.workspace = true +tokio-util.workspace = true url.workspace = true -# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api -# instead, so that recompile times are better. 
pageserver_api.workspace = true +pageserver_client.workspace = true postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs index 822ac7d8a6..731c05809e 100644 --- a/control_plane/src/attachment_service.rs +++ b/control_plane/src/attachment_service.rs @@ -9,7 +9,7 @@ pub struct AttachmentService { env: LocalEnv, listen: String, path: PathBuf, - client: reqwest::blocking::Client, + client: reqwest::Client, } const COMMAND: &str = "attachment_service"; @@ -53,7 +53,7 @@ impl AttachmentService { env: env.clone(), path, listen, - client: reqwest::blocking::ClientBuilder::new() + client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), } @@ -64,7 +64,7 @@ impl AttachmentService { .expect("non-Unicode path") } - pub fn start(&self) -> anyhow::Result { + pub async fn start(&self) -> anyhow::Result { let path_str = self.path.to_string_lossy(); background_process::start_process( @@ -73,10 +73,11 @@ impl AttachmentService { &self.env.attachment_service_bin(), ["-l", &self.listen, "-p", &path_str], [], - background_process::InitialPidFile::Create(&self.pid_file()), + background_process::InitialPidFile::Create(self.pid_file()), // TODO: a real status check - || Ok(true), + || async move { anyhow::Ok(true) }, ) + .await } pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { @@ -84,7 +85,7 @@ impl AttachmentService { } /// Call into the attach_hook API, for use before handing out attachments to pageservers - pub fn attach_hook( + pub async fn attach_hook( &self, tenant_id: TenantId, pageserver_id: NodeId, @@ -104,16 +105,16 @@ impl AttachmentService { node_id: Some(pageserver_id), }; - let response = self.client.post(url).json(&request).send()?; + let response = self.client.post(url).json(&request).send().await?; if response.status() != StatusCode::OK { return Err(anyhow!("Unexpected status {}", response.status())); } - let response = response.json::()?; + let response = response.json::().await?; Ok(response.gen) } - pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result> { + pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result> { use hyper::StatusCode; let url = self @@ -126,12 +127,12 @@ impl AttachmentService { let request = InspectRequest { tenant_id }; - let response = self.client.post(url).json(&request).send()?; + let response = self.client.post(url).json(&request).send().await?; if response.status() != StatusCode::OK { return Err(anyhow!("Unexpected status {}", response.status())); } - let response = response.json::()?; + let response = response.json::().await?; Ok(response.attachment) } } diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 26fc08fc8f..20fa3af9b8 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50; /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates /// it itself. -pub enum InitialPidFile<'t> { +pub enum InitialPidFile { /// Create a pidfile, to allow future CLI invocations to manipulate the process. - Create(&'t Utf8Path), + Create(Utf8PathBuf), /// The process will create the pidfile itself, need to wait for that event. - Expect(&'t Utf8Path), + Expect(Utf8PathBuf), } /// Start a background child process using the parameters given. 
-pub fn start_process( +pub async fn start_process( process_name: &str, datadir: &Path, command: &Path, @@ -62,7 +62,8 @@ pub fn start_process( process_status_check: F, ) -> anyhow::Result where - F: Fn() -> anyhow::Result, + F: Fn() -> Fut, + Fut: std::future::Future>, AI: IntoIterator, A: AsRef, // Not generic AsRef, otherwise empty `envs` prevents type inference @@ -89,7 +90,7 @@ where let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); filled_cmd.envs(envs); - let pid_file_to_check = match initial_pid_file { + let pid_file_to_check = match &initial_pid_file { InitialPidFile::Create(path) => { pre_exec_create_pidfile(filled_cmd, path); path @@ -107,7 +108,7 @@ where ); for retries in 0..RETRIES { - match process_started(pid, Some(pid_file_to_check), &process_status_check) { + match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { println!("\n{process_name} started, pid: {pid}"); return Ok(spawned_process); @@ -316,22 +317,20 @@ where cmd } -fn process_started( +async fn process_started( pid: Pid, - pid_file_to_check: Option<&Utf8Path>, + pid_file_to_check: &Utf8Path, status_check: &F, ) -> anyhow::Result where - F: Fn() -> anyhow::Result, + F: Fn() -> Fut, + Fut: std::future::Future>, { - match status_check() { - Ok(true) => match pid_file_to_check { - Some(pid_file_path) => match pid_file::read(pid_file_path)? { - PidFileRead::NotExist => Ok(false), - PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), - PidFileRead::NotHeldByAnyProcess(_) => Ok(false), - }, - None => Ok(true), + match status_check().await { + Ok(true) => match pid_file::read(pid_file_to_check)? { + PidFileRead::NotExist => Ok(false), + PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), + PidFileRead::NotHeldByAnyProcess(_) => Ok(false), }, Ok(false) => Ok(false), Err(e) => anyhow::bail!("process failed to start: {e}"), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f0b929ac6..03e69010f7 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -120,15 +120,20 @@ fn main() -> Result<()> { let mut env = LocalEnv::load_config().context("Error loading config")?; let original_env = env.clone(); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let subcommand_result = match sub_name { - "tenant" => handle_tenant(sub_args, &mut env), - "timeline" => handle_timeline(sub_args, &mut env), - "start" => handle_start_all(sub_args, &env), + "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), + "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), + "start" => rt.block_on(handle_start_all(sub_args, &env)), "stop" => handle_stop_all(sub_args, &env), - "pageserver" => handle_pageserver(sub_args, &env), - "attachment_service" => handle_attachment_service(sub_args, &env), - "safekeeper" => handle_safekeeper(sub_args, &env), - "endpoint" => handle_endpoint(sub_args, &env), + "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), + "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), + "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), + "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), _ => bail!("unexpected subcommand {sub_name}"), @@ -269,12 +274,13 @@ fn print_timeline( /// Returns a map of 
timeline IDs to timeline_id@lsn strings. /// Connects to the pageserver to query this information. -fn get_timeline_infos( +async fn get_timeline_infos( env: &local_env::LocalEnv, tenant_id: &TenantId, ) -> Result> { Ok(get_default_pageserver(env) - .timeline_list(tenant_id)? + .timeline_list(tenant_id) + .await? .into_iter() .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) .collect()) @@ -373,11 +379,14 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_tenant( + tenant_match: &ArgMatches, + env: &mut local_env::LocalEnv, +) -> anyhow::Result<()> { let pageserver = get_default_pageserver(env); match tenant_match.subcommand() { Some(("list", _)) => { - for t in pageserver.tenant_list()? { + for t in pageserver.tenant_list().await? { println!("{} {:?}", t.id, t.state); } } @@ -394,12 +403,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an // We must register the tenant with the attachment service, so // that when the pageserver restarts, it will be re-attached. let attachment_service = AttachmentService::from_env(env); - attachment_service.attach_hook(tenant_id, pageserver.conf.id)? + attachment_service + .attach_hook(tenant_id, pageserver.conf.id) + .await? } else { None }; - pageserver.tenant_create(tenant_id, generation, tenant_conf)?; + pageserver + .tenant_create(tenant_id, generation, tenant_conf) + .await?; println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant @@ -409,14 +422,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .copied() .context("Failed to parse postgres version from the argument string")?; - let timeline_info = pageserver.timeline_create( - tenant_id, - new_timeline_id, - None, - None, - Some(pg_version), - None, - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -450,6 +465,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an pageserver .tenant_config(tenant_id, tenant_conf) + .await .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } @@ -458,7 +474,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an let new_pageserver = get_pageserver(env, matches)?; let new_pageserver_id = new_pageserver.conf.id; - migrate_tenant(env, tenant_id, new_pageserver)?; + migrate_tenant(env, tenant_id, new_pageserver).await?; println!("tenant {tenant_id} migrated to {}", new_pageserver_id); } @@ -468,13 +484,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an Ok(()) } -fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { +async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = get_default_pageserver(env); match timeline_match.subcommand() { Some(("list", list_match)) => { let tenant_id = get_tenant_id(list_match, env)?; - let timelines = pageserver.timeline_list(&tenant_id)?; + let timelines = pageserver.timeline_list(&tenant_id).await?; print_timelines_tree(timelines, 
env.timeline_name_mappings())?; } Some(("create", create_match)) => { @@ -490,14 +506,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_timeline_id_opt = parse_timeline_id(create_match)?; - let timeline_info = pageserver.timeline_create( - tenant_id, - new_timeline_id_opt, - None, - None, - Some(pg_version), - None, - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + new_timeline_id_opt, + None, + None, + Some(pg_version), + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -542,7 +560,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into pageserver ..."); - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; + pageserver + .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version) + .await?; env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; println!("Creating endpoint for imported timeline ..."); @@ -578,14 +598,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline_info = pageserver.timeline_create( - tenant_id, - None, - start_lsn, - Some(ancestor_timeline_id), - None, - None, - )?; + let timeline_info = pageserver + .timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + None, + None, + ) + .await?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -604,7 +626,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - Ok(()) } -fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match ep_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, None => bail!("no endpoint subcommand provided"), @@ -614,10 +636,12 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( match sub_name { "list" => { let tenant_id = get_tenant_id(sub_args, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); + let timeline_infos = get_timeline_infos(env, &tenant_id) + .await + .unwrap_or_else(|e| { + eprintln!("Failed to load timeline info: {}", e); + HashMap::new() + }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -791,7 +815,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( }; println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token, safekeepers, remote_ext_config)?; + endpoint + .start(&auth_token, safekeepers, remote_ext_config) + .await?; } "reconfigure" => { let endpoint_id = sub_args @@ -809,7 +835,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( } else { None }; - endpoint.reconfigure(pageserver_id)?; + endpoint.reconfigure(pageserver_id).await?; } "stop" => { let endpoint_id = sub_args @@ -875,11 +901,12 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { +async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match 
sub_match.subcommand() { Some(("start", subcommand_args)) => { if let Err(e) = get_pageserver(env, subcommand_args)? .start(&pageserver_config_overrides(subcommand_args)) + .await { eprintln!("pageserver start failed: {e}"); exit(1); @@ -906,7 +933,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul exit(1); } - if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -920,14 +950,17 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul exit(1); } - if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(subcommand_args)) + .await + { eprintln!("pageserver start failed: {e}"); exit(1); } } Some(("status", subcommand_args)) => { - match get_pageserver(env, subcommand_args)?.check_status() { + match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), Err(err) => { eprintln!("Page server is not available: {}", err); @@ -942,11 +975,14 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_attachment_service( + sub_match: &ArgMatches, + env: &local_env::LocalEnv, +) -> Result<()> { let svc = AttachmentService::from_env(env); match sub_match.subcommand() { Some(("start", _start_match)) => { - if let Err(e) = svc.start() { + if let Err(e) = svc.start().await { eprintln!("start failed: {e}"); exit(1); } @@ -987,7 +1023,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec { .collect() } -fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { Some(safekeeper_command_data) => safekeeper_command_data, None => bail!("no safekeeper subcommand provided"), @@ -1005,7 +1041,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul "start" => { let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts) { + if let Err(e) = safekeeper.start(extra_opts).await { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1031,7 +1067,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul } let extra_opts = safekeeper_extra_opts(sub_args); - if let Err(e) = safekeeper.start(extra_opts) { + if let Err(e) = safekeeper.start(extra_opts).await { eprintln!("safekeeper start failed: {}", e); exit(1); } @@ -1044,15 +1080,15 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { // Endpoints are not started automatically - broker::start_broker_process(env)?; + broker::start_broker_process(env).await?; // Only start the attachment service if the pageserver is configured to need it if env.control_plane_api.is_some() { let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start() { + if let Err(e) = 
attachment_service.start().await { eprintln!("attachment_service start failed: {:#}", e); try_stop_all(env, true); exit(1); @@ -1061,7 +1097,10 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { + if let Err(e) = pageserver + .start(&pageserver_config_overrides(sub_match)) + .await + { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); try_stop_all(env, true); exit(1); @@ -1070,7 +1109,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start(vec![]) { + if let Err(e) = safekeeper.start(vec![]).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); try_stop_all(env, false); exit(1); diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index 6be865cc2e..f40705863b 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -11,7 +11,7 @@ use camino::Utf8PathBuf; use crate::{background_process, local_env}; -pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { +pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let broker = &env.broker; let listen_addr = &broker.listen_addr; @@ -19,15 +19,15 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let args = [format!("--listen-addr={listen_addr}")]; - let client = reqwest::blocking::Client::new(); + let client = reqwest::Client::new(); background_process::start_process( "storage_broker", &env.base_data_dir, &env.storage_broker_bin(), args, [], - background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)), - || { + background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)), + || async { let url = broker.client_url(); let status_url = url.join("status").with_context(|| { format!("Failed to append /status path to broker endpoint {url}") @@ -36,12 +36,13 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { .get(status_url) .build() .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?; - match client.execute(request) { + match client.execute(request).await { Ok(resp) => Ok(resp.status().is_success()), Err(_) => Ok(false), } }, ) + .await .context("Failed to spawn storage_broker subprocess")?; Ok(()) } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index a566f03db9..071f22dc2b 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -464,7 +464,7 @@ impl Endpoint { } } - pub fn start( + pub async fn start( &self, auth_token: &Option, safekeepers: Vec, @@ -587,7 +587,7 @@ impl Endpoint { const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s loop { attempt += 1; - match self.get_status() { + match self.get_status().await { Ok(state) => { match state.status { ComputeStatus::Init => { @@ -629,8 +629,8 @@ impl Endpoint { } // Call the /status HTTP API - pub fn get_status(&self) -> Result { - let client = reqwest::blocking::Client::new(); + pub async fn get_status(&self) -> Result { + let client = reqwest::Client::new(); let response = client .request( @@ -641,16 +641,17 @@ impl Endpoint { self.http_address.port() ), ) - .send()?; + .send() + .await?; // Interpret the response let status = response.status(); if 
!(status.is_client_error() || status.is_server_error()) { - Ok(response.json()?) + Ok(response.json().await?) } else { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = response.url().to_owned(); - let msg = match response.text() { + let msg = match response.text().await { Ok(err_body) => format!("Error: {}", err_body), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; @@ -658,7 +659,7 @@ impl Endpoint { } } - pub fn reconfigure(&self, pageserver_id: Option) -> Result<()> { + pub async fn reconfigure(&self, pageserver_id: Option) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -687,7 +688,7 @@ impl Endpoint { spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}")); } - let client = reqwest::blocking::Client::new(); + let client = reqwest::Client::new(); let response = client .post(format!( "http://{}:{}/configure", @@ -698,14 +699,15 @@ impl Endpoint { "{{\"spec\":{}}}", serde_json::to_string_pretty(&spec)? )) - .send()?; + .send() + .await?; let status = response.status(); if !(status.is_client_error() || status.is_server_error()) { Ok(()) } else { let url = response.url().to_owned(); - let msg = match response.text() { + let msg = match response.text().await { Ok(err_body) => format!("Error: {}", err_body), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 981ddd3e98..7d490016bf 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -6,28 +6,24 @@ //! use std::borrow::Cow; use std::collections::HashMap; -use std::fs::File; -use std::io::{BufReader, Write}; + +use std::io; +use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; use std::process::{Child, Command}; use std::time::Duration; -use std::{io, result}; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use pageserver_api::models::{ - self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo, -}; +use futures::SinkExt; +use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{parse_host_port, PgConnectionConfig}; -use reqwest::blocking::{Client, RequestBuilder, Response}; -use reqwest::{IntoUrl, Method}; -use thiserror::Error; use utils::auth::{Claims, Scope}; use utils::{ - http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -38,45 +34,6 @@ use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver"; -#[derive(Error, Debug)] -pub enum PageserverHttpError { - #[error("Reqwest error: {0}")] - Transport(#[from] reqwest::Error), - - #[error("Error: {0}")] - Response(String), -} - -impl From for PageserverHttpError { - fn from(e: anyhow::Error) -> Self { - Self::Response(e.to_string()) - } -} - -type Result = result::Result; - -pub trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> Result; -} - -impl ResponseErrorMessageExt for Response { - fn error_from_body(self) -> Result { - let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { - return Ok(self); - } - - // reqwest does not export its error construction utility functions, so let's craft the message ourselves - let url = self.url().to_owned(); - Err(PageserverHttpError::Response( - match self.json::() { - Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), - }, - )) - } -} - // // Control routines for pageserver. // @@ -87,8 +44,7 @@ pub struct PageServerNode { pub pg_connection_config: PgConnectionConfig, pub conf: PageServerConf, pub env: LocalEnv, - pub http_client: Client, - pub http_base_url: String, + pub http_client: mgmt_api::Client, } impl PageServerNode { @@ -100,8 +56,19 @@ impl PageServerNode { pg_connection_config: PgConnectionConfig::new_host_port(host, port), conf: conf.clone(), env: env.clone(), - http_client: Client::new(), - http_base_url: format!("http://{}/v1", conf.listen_http_addr), + http_client: mgmt_api::Client::new( + format!("http://{}", conf.listen_http_addr), + { + match conf.http_auth_type { + AuthType::Trust => None, + AuthType::NeonJWT => Some( + env.generate_auth_token(&Claims::new(None, Scope::PageServerApi)) + .unwrap(), + ), + } + } + .as_deref(), + ), } } @@ -182,8 +149,8 @@ impl PageServerNode { .expect("non-Unicode path") } - pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result { - self.start_node(config_overrides, false) + pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result { + self.start_node(config_overrides, false).await } fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { @@ -224,7 +191,12 @@ impl PageServerNode { Ok(()) } - fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result { + async fn start_node( + &self, + config_overrides: &[&str], + update_config: bool, + ) -> anyhow::Result { + // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( "Starting pageserver node {} at '{}' in {:?}", @@ -232,7 +204,7 @@ impl PageServerNode { self.pg_connection_config.raw_address(), datadir ); - io::stdout().flush()?; + io::stdout().flush().context("flush stdout")?; let datadir_path_str = datadir.to_str().with_context(|| { format!( @@ -244,20 +216,23 @@ impl PageServerNode { if update_config { args.push(Cow::Borrowed("--update-config")); } - background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), args.iter().map(Cow::as_ref), self.pageserver_env_variables()?, - background_process::InitialPidFile::Expect(&self.pid_file()), - || match self.check_status() { - Ok(()) => Ok(true), - Err(PageserverHttpError::Transport(_)) => Ok(false), - Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + background_process::InitialPidFile::Expect(self.pid_file()), + || async { + let st = 
self.check_status().await; + match st { + Ok(()) => Ok(true), + Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + } }, ) + .await } fn pageserver_basic_args<'a>( @@ -303,7 +278,12 @@ impl PageServerNode { background_process::stop_process(immediate, "pageserver", &self.pid_file()) } - pub fn page_server_psql_client(&self) -> anyhow::Result { + pub async fn page_server_psql_client( + &self, + ) -> anyhow::Result<( + tokio_postgres::Client, + tokio_postgres::Connection, + )> { let mut config = self.pg_connection_config.clone(); if self.conf.pg_auth_type == AuthType::NeonJWT { let token = self @@ -311,36 +291,18 @@ impl PageServerNode { .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; config = config.set_password(Some(token)); } - Ok(config.connect_no_tls()?) + Ok(config.connect_no_tls().await?) } - fn http_request(&self, method: Method, url: U) -> anyhow::Result { - let mut builder = self.http_client.request(method, url); - if self.conf.http_auth_type == AuthType::NeonJWT { - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; - builder = builder.bearer_auth(token) - } - Ok(builder) + pub async fn check_status(&self) -> mgmt_api::Result<()> { + self.http_client.status().await } - pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/status", self.http_base_url))? - .send()? - .error_from_body()?; - Ok(()) + pub async fn tenant_list(&self) -> mgmt_api::Result> { + self.http_client.list_tenants().await } - pub fn tenant_list(&self) -> Result> { - Ok(self - .http_request(Method::GET, format!("{}/tenant", self.http_base_url))? - .send()? - .error_from_body()? - .json()?) - } - - pub fn tenant_create( + pub async fn tenant_create( &self, new_tenant_id: TenantId, generation: Option, @@ -418,23 +380,10 @@ impl PageServerNode { if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))? - .json(&request) - .send()? - .error_from_body()? - .json::>() - .with_context(|| { - format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") - })? - .context("No tenant id was found in the tenant creation response") - .and_then(|tenant_id_string| { - tenant_id_string.parse().with_context(|| { - format!("Failed to parse response string as tenant id: '{tenant_id_string}'") - }) - }) + Ok(self.http_client.tenant_create(&request).await?) } - pub fn tenant_config( + pub async fn tenant_config( &self, tenant_id: TenantId, mut settings: HashMap<&str, &str>, @@ -513,54 +462,30 @@ impl PageServerNode { bail!("Unrecognized tenant settings: {settings:?}") } - self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))? - .json(&models::TenantConfigRequest { tenant_id, config }) - .send()? - .error_from_body()?; + self.http_client + .tenant_config(&models::TenantConfigRequest { tenant_id, config }) + .await?; Ok(()) } - pub fn location_config( + pub async fn location_config( &self, tenant_id: TenantId, config: LocationConfig, flush_ms: Option, ) -> anyhow::Result<()> { - let req_body = TenantLocationConfigRequest { tenant_id, config }; - - let path = format!( - "{}/tenant/{}/location_config", - self.http_base_url, tenant_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - - self.http_request(Method::PUT, path)? - .json(&req_body) - .send()? 
- .error_from_body()?; - - Ok(()) + Ok(self + .http_client + .location_config(tenant_id, config, flush_ms) + .await?) } - pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { - let timeline_infos: Vec = self - .http_request( - Method::GET, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - )? - .send()? - .error_from_body()? - .json()?; - - Ok(timeline_infos) + pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { + Ok(self.http_client.list_timelines(*tenant_id).await?) } - pub fn timeline_create( + pub async fn timeline_create( &self, tenant_id: TenantId, new_timeline_id: Option, @@ -571,29 +496,14 @@ impl PageServerNode { ) -> anyhow::Result { // If timeline ID was not specified, generate one let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); - - self.http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), - )? - .json(&models::TimelineCreateRequest { + let req = models::TimelineCreateRequest { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, pg_version, existing_initdb_timeline_id, - }) - .send()? - .error_from_body()? - .json::>() - .with_context(|| { - format!("Failed to parse timeline creation response for tenant id: {tenant_id}") - })? - .with_context(|| { - format!( - "No timeline id was found in the timeline creation response for tenant {tenant_id}" - ) - }) + }; + Ok(self.http_client.timeline_create(tenant_id, &req).await?) } /// Import a basebackup prepared using either: @@ -605,7 +515,7 @@ impl PageServerNode { /// * `timeline_id` - id to assign to imported timeline /// * `base` - (start lsn of basebackup, path to `base.tar` file) /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) - pub fn timeline_import( + pub async fn timeline_import( &self, tenant_id: TenantId, timeline_id: TimelineId, @@ -613,36 +523,60 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let mut client = self.page_server_psql_client()?; + let (client, conn) = self.page_server_psql_client().await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = conn.await { + eprintln!("connection error: {}", e); + } + }); + tokio::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; - let base_tarfile = File::open(base_tarfile_path)?; - let mut base_reader = BufReader::new(base_tarfile); + let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; + let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { - let wal_tarfile = File::open(wal_tarfile_path)?; - let wal_reader = BufReader::new(wal_tarfile); + let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; + let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - // Import base - let import_cmd = format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ); - let mut writer = client.copy_in(&import_cmd)?; - io::copy(&mut base_reader, &mut writer)?; - writer.finish()?; + let copy_in = |reader, cmd| { + let client = &client; + async move { + let writer = client.copy_in(&cmd).await?; + let writer = std::pin::pin!(writer); + let mut writer = writer.sink_map_err(|e| { + std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) + }); + let mut reader = std::pin::pin!(reader); + writer.send_all(&mut reader).await?; + writer.into_inner().finish().await?; + anyhow::Ok(()) + } + }; + // Import base + copy_in( + base_tarfile, + format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ), + ) + .await?; // Import wal if necessary - if let Some(mut wal_reader) = wal_reader { - let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); - let mut writer = client.copy_in(&import_cmd)?; - io::copy(&mut wal_reader, &mut writer)?; - writer.finish()?; + if let Some(wal_reader) = wal_reader { + copy_in( + wal_reader, + format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + ) + .await?; } Ok(()) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index a8baa0ac53..4026ef0eb9 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -13,7 +13,6 @@ use std::{io, result}; use anyhow::Context; use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; -use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use utils::{http::error::HttpErrorBody, id::NodeId}; @@ -34,12 +33,14 @@ pub enum SafekeeperHttpError { type Result = result::Result; +#[async_trait::async_trait] pub trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> Result; + async fn error_from_body(self) -> Result; } -impl ResponseErrorMessageExt for Response { - fn error_from_body(self) -> Result { +#[async_trait::async_trait] +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { let status = self.status(); if !(status.is_client_error() || status.is_server_error()) { return Ok(self); @@ -48,7 +49,7 @@ impl ResponseErrorMessageExt for Response { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(SafekeeperHttpError::Response( - match self.json::() { + match self.json::().await { Ok(err_body) => format!("Error: {}", err_body.msg), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), 
url), }, @@ -69,7 +70,7 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, - pub http_client: Client, + pub http_client: reqwest::Client, pub http_base_url: String, } @@ -80,7 +81,7 @@ impl SafekeeperNode { conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), env: env.clone(), - http_client: Client::new(), + http_client: reqwest::Client::new(), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), } } @@ -103,7 +104,7 @@ impl SafekeeperNode { .expect("non-Unicode path") } - pub fn start(&self, extra_opts: Vec) -> anyhow::Result { + pub async fn start(&self, extra_opts: Vec) -> anyhow::Result { print!( "Starting safekeeper at '{}' in '{}'", self.pg_connection_config.raw_address(), @@ -191,13 +192,16 @@ impl SafekeeperNode { &self.env.safekeeper_bin(), &args, [], - background_process::InitialPidFile::Expect(&self.pid_file()), - || match self.check_status() { - Ok(()) => Ok(true), - Err(SafekeeperHttpError::Transport(_)) => Ok(false), - Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + background_process::InitialPidFile::Expect(self.pid_file()), + || async { + match self.check_status().await { + Ok(()) => Ok(true), + Err(SafekeeperHttpError::Transport(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + } }, ) + .await } /// @@ -216,7 +220,7 @@ impl SafekeeperNode { ) } - fn http_request(&self, method: Method, url: U) -> RequestBuilder { + fn http_request(&self, method: Method, url: U) -> reqwest::RequestBuilder { // TODO: authentication //if self.env.auth_type == AuthType::NeonJWT { // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) @@ -224,10 +228,12 @@ impl SafekeeperNode { self.http_client.request(method, url) } - pub fn check_status(&self) -> Result<()> { + pub async fn check_status(&self) -> Result<()> { self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) - .send()? - .error_from_body()?; + .send() + .await? + .error_from_body() + .await?; Ok(()) } } diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index fbb0358158..79df108896 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -19,11 +19,11 @@ use utils::{ }; /// Given an attached pageserver, retrieve the LSN for all timelines -fn get_lsns( +async fn get_lsns( tenant_id: TenantId, pageserver: &PageServerNode, ) -> anyhow::Result> { - let timelines = pageserver.timeline_list(&tenant_id)?; + let timelines = pageserver.timeline_list(&tenant_id).await?; Ok(timelines .into_iter() .map(|t| (t.timeline_id, t.last_record_lsn)) @@ -32,13 +32,13 @@ fn get_lsns( /// Wait for the timeline LSNs on `pageserver` to catch up with or overtake /// `baseline`. 
-fn await_lsn( +async fn await_lsn( tenant_id: TenantId, pageserver: &PageServerNode, baseline: HashMap, ) -> anyhow::Result<()> { loop { - let latest = match get_lsns(tenant_id, pageserver) { + let latest = match get_lsns(tenant_id, pageserver).await { Ok(l) => l, Err(e) => { println!( @@ -84,7 +84,7 @@ fn await_lsn( /// - Coordinate attach/secondary/detach on pageservers /// - call into attachment_service for generations /// - reconfigure compute endpoints to point to new attached pageserver -pub fn migrate_tenant( +pub async fn migrate_tenant( env: &LocalEnv, tenant_id: TenantId, dest_ps: PageServerNode, @@ -108,16 +108,18 @@ pub fn migrate_tenant( } } - let previous = attachment_service.inspect(tenant_id)?; + let previous = attachment_service.inspect(tenant_id).await?; let mut baseline_lsns = None; if let Some((generation, origin_ps_id)) = &previous { let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?); if origin_ps_id == &dest_ps.conf.id { println!("🔁 Already attached to {origin_ps_id}, freshening..."); - let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; + let gen = attachment_service + .attach_hook(tenant_id, dest_ps.conf.id) + .await?; let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; println!("✅ Migration complete"); return Ok(()); } @@ -126,20 +128,24 @@ pub fn migrate_tenant( let stale_conf = build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None); - origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?; + origin_ps + .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10))) + .await?; - baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?); + baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); } - let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; + let gen = attachment_service + .attach_hook(tenant_id, dest_ps.conf.id) + .await?; let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None); println!("🔁 Attaching to pageserver {}", dest_ps.conf.id); - dest_ps.location_config(tenant_id, dest_conf, None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; if let Some(baseline) = baseline_lsns { println!("🕑 Waiting for LSN to catch up..."); - await_lsn(tenant_id, &dest_ps, baseline)?; + await_lsn(tenant_id, &dest_ps, baseline).await?; } let cplane = ComputeControlPlane::load(env.clone())?; @@ -149,7 +155,7 @@ pub fn migrate_tenant( "🔁 Reconfiguring endpoint {} to use pageserver {}", endpoint_name, dest_ps.conf.id ); - endpoint.reconfigure(Some(dest_ps.conf.id))?; + endpoint.reconfigure(Some(dest_ps.conf.id)).await?; } } @@ -159,7 +165,7 @@ pub fn migrate_tenant( } let other_ps = PageServerNode::from_env(env, other_ps_conf); - let other_ps_tenants = other_ps.tenant_list()?; + let other_ps_tenants = other_ps.tenant_list().await?; // Check if this tenant is attached let found = other_ps_tenants @@ -181,7 +187,9 @@ pub fn migrate_tenant( "💤 Switching to secondary mode on pageserver {}", other_ps.conf.id ); - other_ps.location_config(tenant_id, secondary_conf, None)?; + other_ps + .location_config(tenant_id, secondary_conf, None) + .await?; } println!( @@ -189,7 +197,7 @@ pub fn migrate_tenant( dest_ps.conf.id ); let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, 
None)?; + dest_ps.location_config(tenant_id, dest_conf, None).await?; println!("✅ Migration complete"); diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 35cb1a2691..ccf9108895 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -163,8 +163,18 @@ impl PgConnectionConfig { } /// Connect using postgres protocol with TLS disabled. - pub fn connect_no_tls(&self) -> Result { - postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls) + pub async fn connect_no_tls( + &self, + ) -> Result< + ( + tokio_postgres::Client, + tokio_postgres::Connection, + ), + postgres::Error, + > { + self.to_tokio_postgres_config() + .connect(postgres::NoTls) + .await } } diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml new file mode 100644 index 0000000000..4bd36185a6 --- /dev/null +++ b/pageserver/client/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "pageserver_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +pageserver_api.workspace = true +thiserror.workspace = true +async-trait.workspace = true +reqwest.workspace = true +utils.workspace = true +serde.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs new file mode 100644 index 0000000000..3963fd466c --- /dev/null +++ b/pageserver/client/src/lib.rs @@ -0,0 +1 @@ +pub mod mgmt_api; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs new file mode 100644 index 0000000000..821c238082 --- /dev/null +++ b/pageserver/client/src/mgmt_api.rs @@ -0,0 +1,172 @@ +use pageserver_api::models::*; +use reqwest::{IntoUrl, Method}; +use utils::{ + http::error::HttpErrorBody, + id::{TenantId, TimelineId}, +}; + +#[derive(Debug)] +pub struct Client { + mgmt_api_endpoint: String, + authorization_header: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("receive body: {0}")] + ReceiveBody(reqwest::Error), + + #[error("receive error body: {0}")] + ReceiveErrorBody(String), + + #[error("pageserver API: {0}")] + ApiError(String), +} + +pub type Result = std::result::Result; + +#[async_trait::async_trait] +pub trait ResponseErrorMessageExt: Sized { + async fn error_from_body(self) -> Result; +} + +#[async_trait::async_trait] +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(mut self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url)) + } + }) + } +} + +impl Client { + pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self { + mgmt_api_endpoint, + authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), + client: reqwest::Client::new(), + } + } + + pub async fn list_tenants(&self) -> Result> { + let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn list_timelines( + &self, + tenant_id: TenantId, + ) -> Result> { + let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint); + self.get(&uri) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn timeline_info( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + async fn get(&self, uri: U) -> Result { + self.request(Method::GET, uri, ()).await + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let req = self.client.request(method, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?; + let response = res.error_from_body().await?; + Ok(response) + } + + pub async fn status(&self) -> Result<()> { + let uri = format!("{}/v1/status", self.mgmt_api_endpoint); + self.get(&uri).await?; + Ok(()) + } + + pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { + let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); + self.request(Method::POST, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { + let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); + self.request(Method::PUT, &uri, req).await?; + Ok(()) + } + + pub async fn location_config( + &self, + tenant_id: TenantId, + config: LocationConfig, + flush_ms: Option, + ) -> Result<()> { + let req_body = TenantLocationConfigRequest { tenant_id, config }; + let path = format!( + "{}/v1/tenant/{}/location_config", + self.mgmt_api_endpoint, tenant_id + ); + let path = if let Some(flush_ms) = flush_ms { + format!("{}?flush_ms={}", path, flush_ms.as_millis()) + } else { + path + }; + self.request(Method::PUT, &path, &req_body).await?; + Ok(()) + } + + pub async fn timeline_create( + &self, + tenant_id: TenantId, + req: &TimelineCreateRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline", + self.mgmt_api_endpoint, tenant_id + ); + self.request(Method::POST, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } +} From 56f7d55ba7f1aaab6f997754b29c27f0a16dff94 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Dec 2023 17:43:02 +0000 Subject: [PATCH 10/19] pageserver: basic cancel/timeout for remote storage operations (#6097) ## Problem Various places in remote storage were not subject to a timeout (thereby stuck TCP connections could hold things up), and did not respect a cancellation token (so things like timeline deletion or tenant detach would have to wait arbitrarily long). ## Summary of changes - Add download_cancellable and upload_cancellable helpers, and use them in all the places we wait for remote storage operations (with the exception of initdb downloads, where it would not have been safe). - Add a cancellation token arg to `download_retry`. - Use cancellation token args in various places that were missing one per #5066 Closes: #5066 Why is this only "basic" handling? - Doesn't express difference between shutdown and errors in return types, to avoid refactoring all the places that use an anyhow::Error (these should all eventually return a more structured error type) - Implements timeouts on top of remote storage, rather than within it: this means that operations hitting their timeout will lose their semaphore permit and thereby go to the back of the queue for their retry. 
- Doing a nicer job is tracked in https://github.com/neondatabase/neon/issues/6096 --- libs/utils/src/timeout.rs | 3 + pageserver/src/tenant.rs | 6 +- pageserver/src/tenant/delete.rs | 30 +++++-- .../src/tenant/remote_timeline_client.rs | 53 +++++++++++- .../tenant/remote_timeline_client/download.rs | 86 ++++++++++++++----- .../tenant/remote_timeline_client/upload.rs | 30 ++++--- pageserver/src/tenant/storage_layer/layer.rs | 1 + .../fixtures/pageserver/allowed_errors.py | 3 + 8 files changed, 166 insertions(+), 46 deletions(-) diff --git a/libs/utils/src/timeout.rs b/libs/utils/src/timeout.rs index 11fa417242..56bf57a900 100644 --- a/libs/utils/src/timeout.rs +++ b/libs/utils/src/timeout.rs @@ -2,8 +2,11 @@ use std::time::Duration; use tokio_util::sync::CancellationToken; +#[derive(thiserror::Error, Debug)] pub enum TimeoutCancellableError { + #[error("Timed out")] Timeout, + #[error("Cancelled")] Cancelled, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ace20e44e2..0d1be33331 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3059,6 +3059,7 @@ impl Tenant { storage, &self.tenant_shard_id, &existing_initdb_timeline_id, + &self.cancel, ) .await .context("download initdb tar")?; @@ -3099,6 +3100,7 @@ impl Tenant { &timeline_id, pgdata_zstd.try_clone().await?, tar_zst_size, + &self.cancel, ) .await }, @@ -3106,9 +3108,7 @@ impl Tenant { 3, u32::MAX, "persist_initdb_tar_zst", - backoff::Cancel::new(self.cancel.clone(), || { - anyhow::anyhow!("initdb upload cancelled") - }), + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await?; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index acd311ace6..e8491f26db 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -71,6 +71,7 @@ async fn create_remote_delete_mark( conf: &PageServerConf, remote_storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; @@ -87,8 +88,7 @@ async fn create_remote_delete_mark( FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "mark_upload", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .context("mark_upload")?; @@ -170,6 +170,7 @@ async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { if let Some(remote_storage) = remote_storage { let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; @@ -179,8 +180,7 @@ async fn remove_tenant_remote_delete_mark( FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "remove_tenant_remote_delete_mark", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .context("remove_tenant_remote_delete_mark")?; @@ -322,9 +322,15 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. 
if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id) - .await - .context("remote_mark")? + create_remote_delete_mark( + conf, + remote_storage, + &tenant.tenant_shard_id, + // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token + &CancellationToken::new(), + ) + .await + .context("remote_mark")? } fail::fail_point!("tenant-delete-before-create-local-mark", |_| { @@ -524,8 +530,14 @@ impl DeleteTenantFlow { .context("timelines dir not empty")?; } - remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id) - .await?; + remove_tenant_remote_delete_mark( + conf, + remote_storage.as_ref(), + &tenant.tenant_shard_id, + // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token + &CancellationToken::new(), + ) + .await?; fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { Err(anyhow::anyhow!( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4b271a7395..52ee8f49ce 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -196,10 +196,12 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; +use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; use std::ops::DerefMut; @@ -316,6 +318,47 @@ pub struct RemoteTimelineClient { storage_impl: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + + cancel: CancellationToken, +} + +/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not +/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. +const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); +const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); + +/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow. +/// +/// This is a convenience for the various upload functions. In future +/// the anyhow::Error result should be replaced with a more structured type that +/// enables callers to avoid handling shutdown as an error. +async fn upload_cancellable(cancel: &CancellationToken, future: F) -> anyhow::Result<()> +where + F: std::future::Future>, +{ + match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await { + Ok(Ok(())) => Ok(()), + Ok(Err(e)) => Err(e), + Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")), + Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")), + } +} +/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError. 
+async fn download_cancellable( + cancel: &CancellationToken, + future: F, +) -> Result +where + F: std::future::Future>, +{ + match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await { + Ok(Ok(r)) => Ok(r), + Ok(Err(e)) => Err(e), + Err(TimeoutCancellableError::Timeout) => { + Err(DownloadError::Other(anyhow::anyhow!("Timed out"))) + } + Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled), + } } impl RemoteTimelineClient { @@ -351,6 +394,7 @@ impl RemoteTimelineClient { &tenant_shard_id, &timeline_id, )), + cancel: CancellationToken::new(), } } @@ -501,6 +545,7 @@ impl RemoteTimelineClient { &self, layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, + cancel: &CancellationToken, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -517,6 +562,7 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -971,6 +1017,7 @@ impl RemoteTimelineClient { &self.timeline_id, self.generation, &index_part_with_deleted_at, + &self.cancel, ) }, |_e| false, @@ -980,8 +1027,7 @@ impl RemoteTimelineClient { // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await?; @@ -1281,6 +1327,7 @@ impl RemoteTimelineClient { path, layer_metadata, self.generation, + &self.cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -1307,6 +1354,7 @@ impl RemoteTimelineClient { &self.timeline_id, self.generation, index_part, + &self.cancel, ) .measure_remote_op( self.tenant_shard_id.tenant_id, @@ -1828,6 +1876,7 @@ mod tests { &self.harness.tenant_shard_id, &TIMELINE_ID, )), + cancel: CancellationToken::new(), }) } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index ed32c4eed9..d3956163c8 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,7 +5,6 @@ use std::collections::HashSet; use std::future::Future; -use std::time::Duration; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -14,13 +13,17 @@ use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::warn; +use utils::timeout::timeout_cancellable; use utils::{backoff, crashsafe}; use crate::config::PageServerConf; -use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::remote_timeline_client::{ + download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, +}; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; +use crate::virtual_file::on_fatal_io_error; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; @@ -32,8 +35,6 @@ use super::{ FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; -static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); - /// /// If 'metadata' is given, we will 
validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) @@ -46,6 +47,7 @@ pub async fn download_layer_file<'a>( timeline_id: TimelineId, layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, + cancel: &CancellationToken, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -73,14 +75,18 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let cancel_inner = cancel.clone(); let (mut destination_file, bytes_amount) = download_retry( || async { let destination_file = tokio::fs::File::create(&temp_file_path) .await .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) .map_err(DownloadError::Other)?; - let download = storage - .download(&remote_path) + + // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local + // file: the write to local file doesn't start until after the request header is returned + // and we start draining the body stream below + let download = download_cancellable(&cancel_inner, storage.download(&remote_path)) .await .with_context(|| { format!( @@ -94,12 +100,33 @@ pub async fn download_layer_file<'a>( let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - let bytes_amount = tokio::time::timeout( - MAX_DOWNLOAD_DURATION, + // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file, + // and we will unlink the temporary file if there is an error. This unlink is important because we + // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that + // we will imminiently try and write to again. + let bytes_amount: u64 = match timeout_cancellable( + DOWNLOAD_TIMEOUT, + &cancel_inner, tokio::io::copy_buf(&mut reader, &mut destination_file), ) .await - .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))? + .with_context(|| { + format!( + "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" + ) + }) + .map_err(DownloadError::Other)? + { + Ok(b) => Ok(b), + Err(e) => { + // Remove incomplete files: on restart Timeline would do this anyway, but we must + // do it here for the retry case. 
+ if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { + on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); + } + Err(e) + } + } .with_context(|| { format!( "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" @@ -112,6 +139,7 @@ pub async fn download_layer_file<'a>( Ok((destination_file, bytes_amount)) }, &format!("download {remote_path:?}"), + cancel, ) .await?; @@ -188,8 +216,14 @@ pub async fn list_remote_timelines( anyhow::bail!("storage-sync-list-remote-timelines"); }); + let cancel_inner = cancel.clone(); let listing = download_retry_forever( - || storage.list(Some(&remote_path), ListingMode::WithDelimiter), + || { + download_cancellable( + &cancel_inner, + storage.list(Some(&remote_path), ListingMode::WithDelimiter), + ) + }, &format!("list timelines for {tenant_shard_id}"), cancel, ) @@ -230,9 +264,13 @@ async fn do_download_index_part( let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + let cancel_inner = cancel.clone(); let index_part_bytes = download_retry_forever( || async { - let index_part_download = storage.download(&remote_path).await?; + // Cancellation: if is safe to cancel this future because we're just downloading into + // a memory buffer, not touching local disk. + let index_part_download = + download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; let mut index_part_bytes = Vec::new(); let mut stream = std::pin::pin!(index_part_download.download_stream); @@ -347,10 +385,7 @@ pub(super) async fn download_index_part( FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "listing index_part files", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error { - unreachable!() - }), + backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), ) .await .map_err(DownloadError::Other)?; @@ -389,6 +424,7 @@ pub(crate) async fn download_initdb_tar_zst( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, + cancel: &CancellationToken, ) -> Result<(Utf8PathBuf, File), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -406,6 +442,8 @@ pub(crate) async fn download_initdb_tar_zst( "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}" )); + let cancel_inner = cancel.clone(); + let file = download_retry( || async { let file = OpenOptions::new() @@ -418,10 +456,14 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = storage.download(&remote_path).await?; + let download = + download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; let mut download = tokio_util::io::StreamReader::new(download.download_stream); let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); + // TODO: this consumption of the response body should be subject to timeout + cancellation, but + // not without thinking carefully about how to recover safely from cancelling a write to + // local storage (e.g. 
by writing into a temp file as we do in download_layer) tokio::io::copy_buf(&mut download, &mut writer) .await .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) @@ -437,6 +479,7 @@ pub(crate) async fn download_initdb_tar_zst( Ok(file) }, &format!("download {remote_path}"), + cancel, ) .await .map_err(|e| { @@ -460,7 +503,11 @@ pub(crate) async fn download_initdb_tar_zst( /// with backoff. /// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry(op: O, description: &str) -> Result +async fn download_retry( + op: O, + description: &str, + cancel: &CancellationToken, +) -> Result where O: FnMut() -> F, F: Future>, @@ -471,10 +518,7 @@ where FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || -> DownloadError { - unreachable!() - }), + backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled), ) .await } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 9f5b3c242b..11c6956875 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -7,12 +7,14 @@ use pageserver_api::shard::TenantShardId; use std::io::{ErrorKind, SeekFrom}; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; +use tokio_util::sync::CancellationToken; use super::Generation; use crate::{ config::PageServerConf, tenant::remote_timeline_client::{ index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path, + upload_cancellable, }, }; use remote_storage::GenericRemoteStorage; @@ -29,6 +31,7 @@ pub(super) async fn upload_index_part<'a>( timeline_id: &TimelineId, generation: Generation, index_part: &'a IndexPart, + cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -44,14 +47,16 @@ pub(super) async fn upload_index_part<'a>( let index_part_bytes = bytes::Bytes::from(index_part_bytes); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); - storage - .upload_storage_object( + upload_cancellable( + cancel, + storage.upload_storage_object( futures::stream::once(futures::future::ready(Ok(index_part_bytes))), index_part_size, &remote_path, - ) - .await - .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) + ), + ) + .await + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. 
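An aside on the mechanism used throughout this patch: upload_cancellable and download_cancellable are thin wrappers that race the remote-storage future against both a fixed timeout and the caller's CancellationToken. A minimal sketch of that racing pattern, assuming a tokio runtime; the names below are illustrative stand-ins rather than the exact items in libs/utils/src/timeout.rs:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Illustrative stand-in for the error type in libs/utils/src/timeout.rs.
enum SketchTimeoutError {
    Timeout,
    Cancelled,
}

// Race a remote-storage future against a wall-clock timeout and a CancellationToken.
// Whichever branch loses the race is dropped, so only cancellation-safe futures may
// be passed here.
async fn sketch_timeout_cancellable<F: std::future::Future>(
    duration: Duration,
    cancel: &CancellationToken,
    future: F,
) -> Result<F::Output, SketchTimeoutError> {
    tokio::select! {
        r = future => Ok(r),
        _ = tokio::time::sleep(duration) => Err(SketchTimeoutError::Timeout),
        _ = cancel.cancelled() => Err(SketchTimeoutError::Cancelled),
    }
}

Losing the race abandons the inner future, which is the constraint the new cancellation-safety comments in download.rs spell out: anything that writes local state must either be cancellation safe or clean up after itself, hence the unlink of the temporary file on error in the layer download path.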
@@ -64,6 +69,7 @@ pub(super) async fn upload_timeline_layer<'a>( source_path: &'a Utf8Path, known_metadata: &'a LayerFileMetadata, generation: Generation, + cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { bail!("failpoint before-upload-layer") @@ -107,8 +113,7 @@ pub(super) async fn upload_timeline_layer<'a>( let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); - storage - .upload(reader, fs_size, &storage_path, None) + upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None)) .await .with_context(|| format!("upload layer from local path '{source_path}'"))?; @@ -122,6 +127,7 @@ pub(crate) async fn upload_initdb_dir( timeline_id: &TimelineId, mut initdb_tar_zst: File, size: u64, + cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading initdb dir"); @@ -131,8 +137,10 @@ pub(crate) async fn upload_initdb_dir( let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); - storage - .upload_storage_object(file, size as usize, &remote_path) - .await - .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) + upload_cancellable( + cancel, + storage.upload_storage_object(file, size as usize, &remote_path), + ) + .await + .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 69a2893456..a4b102c314 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -862,6 +862,7 @@ impl LayerInner { let result = client.download_layer_file( &this.desc.filename(), &this.metadata(), + &crate::task_mgr::shutdown_token() ) .await; diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 5c3ae3ce4b..74c6bddf23 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -79,6 +79,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these # and it is not a failure of our code when it happens. ".*DeleteObjects.*We encountered an internal error. Please try again.*", + # During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this + # up is tracked in https://github.com/neondatabase/neon/issues/6096 + ".*Cancelled, shutting down.*", ) From d066dad84b076daf3781cdf9a692098889d3974e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Dec 2023 20:37:47 +0000 Subject: [PATCH 11/19] pageserver: prioritize activation of tenants with client requests (#6112) ## Problem During startup, a client request might have to wait a long time while the system is busy initializing all the attached tenants, even though most of the attached tenants probably don't have any client requests to service, and could wait a bit. ## Summary of changes - Add a semaphore to limit how many Tenant::spawn()s may concurrently do I/O to attach their tenant (i.e. read indices from remote storage, scan local layer files, etc). 
- Add Tenant::activate_now, a hook for kicking a tenant in its spawn() method to skip waiting for the warmup semaphore - For tenants that attached via warmup semaphore units, wait for logical size calculation to complete before dropping the warmup units - Set Tenant::activate_now in `get_active_tenant_with_timeout` (the page service's path for getting a reference to a tenant). - Wait for tenant activation in HTTP handlers for timeline creation and deletion: like page service requests, these require an active tenant and should prioritize activation if called. --- pageserver/src/config.rs | 36 +++++ pageserver/src/http/routes.rs | 47 +++++- pageserver/src/metrics.rs | 51 ++++++- pageserver/src/tenant.rs | 142 ++++++++++++++++-- pageserver/src/tenant/mgr.rs | 52 ++----- pageserver/src/tenant/timeline.rs | 32 ++++ .../src/tenant/timeline/logical_size.rs | 5 + test_runner/regress/test_timeline_size.py | 142 +++++++++++++++++- 8 files changed, 450 insertions(+), 57 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index cd99cda783..bd63c4d860 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -41,6 +41,8 @@ use crate::{ TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; +use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; + pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; @@ -61,6 +63,8 @@ pub mod defaults { pub const DEFAULT_LOG_FORMAT: &str = "plain"; + pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); @@ -94,6 +98,7 @@ pub mod defaults { #log_format = '{DEFAULT_LOG_FORMAT}' #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' +#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' @@ -180,6 +185,11 @@ pub struct PageServerConf { pub log_format: LogFormat, + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, + /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes + /// loading such tenants, vs. other work in the system. + pub concurrent_tenant_warmup: ConfigurableSemaphore, + /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`. 
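To make the intent of the new concurrent_tenant_warmup knob concrete: warm-up attaches take a permit from a shared semaphore before doing remote-storage I/O, while a tenant that a client is already waiting on can be told to activate immediately and skip that queue. A rough sketch of that gating, with hypothetical names (WarmupGate, wait_for_turn) standing in for the real fields and methods on Tenant:

use std::sync::Arc;
use tokio::sync::{Notify, Semaphore};

// Hypothetical stand-ins for state that, in the real code, lives on Tenant and
// PageServerConf; only the shape of the gating is meant to carry over.
struct WarmupGate {
    // Shared across all tenants, sized from concurrent_tenant_warmup.
    warmup_limit: Arc<Semaphore>,
    // Signalled when a client request needs this particular tenant active now.
    activate_now: Arc<Notify>,
}

impl WarmupGate {
    // Wait for either a warmup permit (ordinary startup) or an "activate now"
    // signal (a client is already waiting), whichever comes first, before
    // starting the expensive attach I/O against remote storage.
    async fn wait_for_turn(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
        tokio::select! {
            permit = self.warmup_limit.clone().acquire_owned() => permit.ok(),
            _ = self.activate_now.notified() => None, // skip the queue entirely
        }
    }
}

Per the summary above, tenants that do go through the warmup lane keep holding their permit until startup work such as the initial logical size calculation finishes, so the permit count directly bounds how much startup I/O competes with client traffic.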
@@ -283,6 +293,7 @@ struct PageServerConfigBuilder { log_format: BuilderValue, + concurrent_tenant_warmup: BuilderValue, concurrent_tenant_size_logical_size_queries: BuilderValue, metric_collection_interval: BuilderValue, @@ -340,6 +351,8 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant")), concurrent_tenant_size_logical_size_queries: Set( ConfigurableSemaphore::DEFAULT_INITIAL, ), @@ -453,6 +466,10 @@ impl PageServerConfigBuilder { self.log_format = BuilderValue::Set(log_format) } + pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { + self.concurrent_tenant_warmup = BuilderValue::Set(u); + } + pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } @@ -518,6 +535,9 @@ impl PageServerConfigBuilder { } pub fn build(self) -> anyhow::Result { + let concurrent_tenant_warmup = self + .concurrent_tenant_warmup + .ok_or(anyhow!("missing concurrent_tenant_warmup"))?; let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries .ok_or(anyhow!( @@ -570,6 +590,7 @@ impl PageServerConfigBuilder { .broker_keepalive_interval .ok_or(anyhow!("No broker keepalive interval provided"))?, log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( concurrent_tenant_size_logical_size_queries, ), @@ -807,6 +828,11 @@ impl PageServerConf { "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? ), + "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ + let input = parse_toml_string(key, item)?; + let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; + NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? 
+ }), "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ let input = parse_toml_string(key, item)?; let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; @@ -904,6 +930,10 @@ impl PageServerConf { broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant"), + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( ), @@ -1122,6 +1152,9 @@ background_task_maximum_delay = '334 s' storage_broker::DEFAULT_KEEPALIVE_INTERVAL )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), @@ -1188,6 +1221,9 @@ background_task_maximum_delay = '334 s' broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, + concurrent_tenant_warmup: ConfigurableSemaphore::new( + NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() + ), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 102c9d9a6a..47092c3bb0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -38,6 +38,7 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; +use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, TenantSlotUpsertError, TenantStateError, @@ -67,6 +68,11 @@ use utils::{ // Imports only used for testing APIs use super::models::ConfigureFailpointsRequest; +// For APIs that require an Active tenant, how long should we block waiting for that state? +// This is not functionally necessary (clients will retry), but avoids generating a lot of +// failed API calls while tenants are activating. +const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); + pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, @@ -233,6 +239,19 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(e: GetActiveTenantError) -> ApiError { + match e { + GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), + GetActiveTenantError::Cancelled => ApiError::ShuttingDown, + GetActiveTenantError::NotFound(gte) => gte.into(), + GetActiveTenantError::WaitForActiveTimeout { .. 
} => { + ApiError::ResourceUnavailable(format!("{}", e).into()) + } + } + } +} + impl From for ApiError { fn from(e: SetNewTenantConfigError) -> ApiError { match e { @@ -435,7 +454,10 @@ async fn timeline_create_handler( let state = get_state(&request); async { - let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?; + let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + match tenant.create_timeline( new_timeline_id, request_data.ancestor_timeline_id.map(TimelineId::from), @@ -694,11 +716,23 @@ async fn timeline_delete_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); - state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx) - .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id, false) + .map_err(|e| { + match e { + // GetTenantError has a built-in conversion to ApiError, but in this context we don't + // want to treat missing tenants as 404, to avoid ambiguity with successful deletions. + GetTenantError::NotFound(_) => ApiError::PreconditionFailed( + "Requested tenant is missing".to_string().into_boxed_str(), + ), + e => e.into(), + } + })?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -1136,7 +1170,10 @@ async fn tenant_create_handler( // We created the tenant. Existing API semantics are that the tenant // is Active when this function returns. - if let res @ Err(_) = new_tenant.wait_to_become_active().await { + if let res @ Err(_) = new_tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + { // This shouldn't happen because we just created the tenant directory // in tenant::mgr::create_tenant, and there aren't any remote timelines // to load, so, nothing can really fail during load. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ba6fd00bd1..45c01b71d1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -684,14 +684,54 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); -/// How long did tenants take to go from construction to active state? -pub(crate) static TENANT_ACTIVATION: Lazy = Lazy::new(|| { - register_histogram!( +/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things +/// like how long it took to load. +/// +/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant +/// metrics are rather expensive, and usually fine grained stuff makes more sense +/// at a timeline level than tenant level. +pub(crate) struct TenantMetrics { + /// How long did tenants take to go from construction to active state? + pub(crate) activation: Histogram, + pub(crate) preload: Histogram, + pub(crate) attach: Histogram, + + /// How many tenants are included in the initial startup of the pagesrever? 
+ pub(crate) startup_scheduled: IntCounter, + pub(crate) startup_complete: IntCounter, +} + +pub(crate) static TENANT: Lazy = Lazy::new(|| { + TenantMetrics { + activation: register_histogram!( "pageserver_tenant_activation_seconds", "Time taken by tenants to activate, in seconds", CRITICAL_OP_BUCKETS.into() ) - .expect("Failed to register pageserver_tenant_activation_seconds metric") + .expect("Failed to register metric"), + preload: register_histogram!( + "pageserver_tenant_preload_seconds", + "Time taken by tenants to load remote metadata on startup/attach, in seconds", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), + attach: register_histogram!( + "pageserver_tenant_attach_seconds", + "Time taken by tenants to intialize, after remote metadata is already loaded", + CRITICAL_OP_BUCKETS.into() + ) + .expect("Failed to register metric"), + startup_scheduled: register_int_counter!( + "pageserver_tenant_startup_scheduled", + "Number of tenants included in pageserver startup (doesn't count tenants attached later)" + ).expect("Failed to register metric"), + startup_complete: register_int_counter!( + "pageserver_tenant_startup_complete", + "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \ + should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \ + tenants: such cases will lead to this metric never reaching the scheduled count." + ).expect("Failed to register metric"), +} }); /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. @@ -2213,6 +2253,9 @@ pub fn preinitialize_metrics() { // Deletion queue stats Lazy::force(&DELETION_QUEUE); + // Tenant stats + Lazy::force(&TENANT); + // Tenant manager stats Lazy::force(&TENANT_MANAGER); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0d1be33331..1478a1a445 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -36,6 +36,8 @@ use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; +use utils::timeout::timeout_cancellable; +use utils::timeout::TimeoutCancellableError; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; @@ -59,7 +61,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::TENANT_ACTIVATION; +use crate::metrics::TENANT; use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; @@ -226,7 +228,7 @@ pub struct Tenant { /// The value creation timestamp, used to measure activation delay, see: /// - loading_started_at: Instant, + constructed_at: Instant, state: watch::Sender, @@ -276,6 +278,11 @@ pub struct Tenant { eviction_task_tenant_state: tokio::sync::Mutex, + /// If the tenant is in Activating state, notify this to encourage it + /// to proceed to Active as soon as possible, rather than waiting for lazy + /// background warmup. + pub(crate) activate_now_sem: tokio::sync::Semaphore, + pub(crate) delete_progress: Arc>, // Cancellation token fires when we have entered shutdown(). This is a parent of @@ -622,6 +629,11 @@ impl Tenant { "attach tenant", false, async move { + scopeguard::defer! 
{ + tracing::info!("Increment complete count"); + TENANT.startup_complete.inc(); + } + // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. let make_broken = |t: &Tenant, err: anyhow::Error| { @@ -648,6 +660,56 @@ impl Tenant { .as_mut() .and_then(|x| x.initial_tenant_load_remote.take()); + enum AttachType<'a> { + // During pageserver startup, we are attaching this tenant lazily in the background + Warmup(tokio::sync::SemaphorePermit<'a>), + // During pageserver startup, we are attaching this tenant as soon as we can, + // because a client tried to access it. + OnDemand, + // During normal operations after startup, we are attaching a tenant. + Normal, + } + + // Before doing any I/O, wait for either or: + // - A client to attempt to access to this tenant (on-demand loading) + // - A permit to become available in the warmup semaphore (background warmup) + // + // Some-ness of init_order is how we know if we're attaching during startup or later + // in process lifetime. + let attach_type = if init_order.is_some() { + tokio::select!( + _ = tenant_clone.activate_now_sem.acquire() => { + tracing::info!("Activating tenant (on-demand)"); + AttachType::OnDemand + }, + permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { + match permit_result { + Ok(p) => { + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup(p) + } + Err(_) => { + // This is unexpected: the warmup semaphore should stay alive + // for the lifetime of init_order. Log a warning and proceed. + tracing::warn!("warmup_limit semaphore unexpectedly closed"); + AttachType::Normal + } + } + + } + _ = tenant_clone.cancel.cancelled() => { + // This is safe, but should be pretty rare: it is interesting if a tenant + // stayed in Activating for such a long time that shutdown found it in + // that state. + tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation"); + return Ok(()); + }, + ) + } else { + AttachType::Normal + }; + + let preload_timer = TENANT.preload.start_timer(); let preload = match mode { SpawnMode::Create => {None}, SpawnMode::Normal => { @@ -670,6 +732,7 @@ impl Tenant { } } }; + preload_timer.observe_duration(); // Remote preload is complete. drop(remote_load_completion); @@ -721,15 +784,39 @@ impl Tenant { } } + let attach_timer = TENANT.attach.start_timer(); match tenant_clone.attach(preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); + attach_timer.observe_duration(); tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { + attach_timer.observe_duration(); make_broken(&tenant_clone, anyhow::anyhow!(e)); } } + + // If we are doing an opportunistic warmup attachment at startup, initialize + // logical size at the same time. This is better than starting a bunch of idle tenants + // with cold caches and then coming back later to initialize their logical sizes. + // + // It also prevents the warmup proccess competing with the concurrency limit on + // logical size calculations: if logical size calculation semaphore is saturated, + // then warmup will wait for that before proceeding to the next tenant. 
+ if let AttachType::Warmup(_permit) = attach_type { + let mut futs = FuturesUnordered::new(); + let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect(); + for t in timelines { + futs.push(t.await_initial_logical_size()) + } + tracing::info!("Waiting for initial logical sizes while warming up..."); + while futs.next().await.is_some() { + + } + tracing::info!("Warm-up complete"); + } + Ok(()) } .instrument({ @@ -1696,6 +1783,15 @@ impl Tenant { Ok(loaded_timeline) } + pub(crate) async fn delete_timeline( + self: Arc, + timeline_id: TimelineId, + ) -> Result<(), DeleteTimelineError> { + DeleteTimelineFlow::run(&self, timeline_id, false).await?; + + Ok(()) + } + /// perform one garbage collection iteration, removing old data files from disk. /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. @@ -1857,7 +1953,7 @@ impl Tenant { ); *current_state = TenantState::Active; - let elapsed = self.loading_started_at.elapsed(); + let elapsed = self.constructed_at.elapsed(); let total_timelines = timelines_accessor.len(); // log a lot of stuff, because some tenants sometimes suffer from user-visible @@ -1872,7 +1968,7 @@ impl Tenant { "activation attempt finished" ); - TENANT_ACTIVATION.observe(elapsed.as_secs_f64()); + TENANT.activation.observe(elapsed.as_secs_f64()); }); } } @@ -2127,18 +2223,41 @@ impl Tenant { self.state.subscribe() } - pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> { + /// The activate_now semaphore is initialized with zero units. As soon as + /// we add a unit, waiters will be able to acquire a unit and proceed. + pub(crate) fn activate_now(&self) { + self.activate_now_sem.add_permits(1); + } + + pub(crate) async fn wait_to_become_active( + &self, + timeout: Duration, + ) -> Result<(), GetActiveTenantError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active - receiver.changed().await.map_err( - |_e: tokio::sync::watch::error::RecvError| - // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) - )?; + self.activate_now(); + match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await { + Ok(r) => { + r.map_err( + |_e: tokio::sync::watch::error::RecvError| + // Tenant existed but was dropped: report it as non-existent + GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) + )? + } + Err(TimeoutCancellableError::Cancelled) => { + return Err(GetActiveTenantError::Cancelled); + } + Err(TimeoutCancellableError::Timeout) => { + return Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state: Some(self.current_state()), + wait_time: timeout, + }); + } + } } TenantState::Active { .. } => { return Ok(()); @@ -2463,7 +2582,7 @@ impl Tenant { conf, // using now here is good enough approximation to catch tenants with really long // activation times. 
- loading_started_at: Instant::now(), + constructed_at: Instant::now(), tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), @@ -2475,6 +2594,7 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + activate_now_sem: tokio::sync::Semaphore::new(0), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f53951e1d3..b2f14db9f7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -28,7 +28,7 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::TENANT_MANAGER as METRICS; +use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt, @@ -44,7 +44,6 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; -use super::timeline::delete::DeleteTimelineFlow; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -430,6 +429,13 @@ pub async fn init_tenant_mgr( let tenant_generations = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + tracing::info!( + "Attaching {} tenants at startup, warming up {} at a time", + tenant_configs.len(), + conf.concurrent_tenant_warmup.initial_permits() + ); + TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); + // Construct `Tenant` objects and start them running for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -848,17 +854,6 @@ impl TenantManager { } } - pub(crate) async fn delete_timeline( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - _ctx: &RequestContext, - ) -> Result<(), DeleteTimelineError> { - let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?; - DeleteTimelineFlow::run(&tenant, timeline_id, false).await?; - Ok(()) - } - #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -1221,7 +1216,10 @@ pub(crate) async fn get_active_tenant_with_timeout( // Fast path: we don't need to do any async waiting. 
return Ok(tenant.clone()); } - _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id), + _ => { + tenant.activate_now(); + (WaitFor::Tenant(tenant.clone()), tenant_shard_id) + } } } Some(TenantSlot::Secondary) => { @@ -1275,28 +1273,10 @@ pub(crate) async fn get_active_tenant_with_timeout( }; tracing::debug!("Waiting for tenant to enter active state..."); - match timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - tenant.wait_to_become_active(), - ) - .await - { - Ok(Ok(())) => Ok(tenant), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => { - let latest_state = tenant.current_state(); - if latest_state == TenantState::Active { - Ok(tenant) - } else { - Err(GetActiveTenantError::WaitForActiveTimeout { - latest_state: Some(latest_state), - wait_time: timeout, - }) - } - } - Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled), - } + tenant + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await?; + Ok(tenant) } pub(crate) async fn delete_tenant( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7438215a68..1e84fa1848 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1734,6 +1734,7 @@ impl Timeline { self.current_logical_size.current_size().accuracy(), logical_size::Accuracy::Exact, ); + self.current_logical_size.initialized.add_permits(1); return; }; @@ -1779,6 +1780,11 @@ impl Timeline { cancel: CancellationToken, background_ctx: RequestContext, ) { + scopeguard::defer! { + // Irrespective of the outcome of this operation, we should unblock anyone waiting for it. + self.current_logical_size.initialized.add_permits(1); + } + enum BackgroundCalculationError { Cancelled, Other(anyhow::Error), @@ -3104,6 +3110,32 @@ impl Timeline { Ok(image_layers) } + + /// Wait until the background initial logical size calculation is complete, or + /// this Timeline is shut down. Calling this function will cause the initial + /// logical size calculation to skip waiting for the background jobs barrier. + pub(crate) async fn await_initial_logical_size(self: Arc) { + if let Some(await_bg_cancel) = self + .current_logical_size + .cancel_wait_for_background_loop_concurrency_limit_semaphore + .get() + { + await_bg_cancel.cancel(); + } else { + // We should not wait if we were not able to explicitly instruct + // the logical size cancellation to skip the concurrency limit semaphore. + // TODO: this is an unexpected case. We should restructure so that it + // can't happen. + tracing::info!( + "await_initial_logical_size: can't get semaphore cancel token, skipping" + ); + } + + tokio::select!( + _ = self.current_logical_size.initialized.acquire() => {}, + _ = self.cancel.cancelled() => {} + ) + } } #[derive(Default)] diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index f2db8c91fc..03bc59ea38 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -34,6 +34,9 @@ pub(super) struct LogicalSize { pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell, + /// Once the initial logical size is initialized, this is notified. + pub(crate) initialized: tokio::sync::Semaphore, + /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. 
pub initial_part_end: Option, @@ -125,6 +128,7 @@ impl LogicalSize { initial_part_end: None, size_added_after_initial: AtomicI64::new(0), did_return_approximate_to_walreceiver: AtomicBool::new(false), + initialized: tokio::sync::Semaphore::new(0), } } @@ -135,6 +139,7 @@ impl LogicalSize { initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), did_return_approximate_to_walreceiver: AtomicBool::new(false), + initialized: tokio::sync::Semaphore::new(0), } } diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 24cbe34457..6e510b2eba 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -300,7 +300,8 @@ def test_timeline_initial_logical_size_calculation_cancellation( env = neon_env_builder.init_start() client = env.pageserver.http_client() - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline # load in some data endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) @@ -732,3 +733,142 @@ def wait_for_timeline_size_init( raise Exception( f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" ) + + +def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): + """ + Tenants warming up opportunistically will wait for one another's logical size calculations to complete + before proceeding. However, they skip this if a client is actively trying to access them. + + This test is not purely about logical sizes, but logical size calculation is the phase that we + use as a proxy for "warming up" in this test: it happens within the semaphore guard used + to limit concurrent tenant warm-up. + """ + + # We will run with the limit set to 1, so that once we have one tenant stuck + # in a pausable failpoint, the rest are prevented from proceeding through warmup. + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # Create some tenants + n_tenants = 10 + tenant_ids = {env.initial_tenant} + for _i in range(0, n_tenants - 1): + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + + # Empty tenants are not subject to waiting for logical size calculations, because + # those happen at the timeline level + timeline_id = TimelineId.generate() + env.neon_cli.create_timeline( + new_branch_name="main", tenant_id=tenant_id, timeline_id=timeline_id + ) + + tenant_ids.add(tenant_id) + + # Restart pageserver with logical size calculations paused + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + def get_tenant_states(): + states = {} + for tenant_id in tenant_ids: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + states[tenant_id] = tenant["state"]["slug"] + log.info(f"Tenant states: {states}") + return states + + def at_least_one_active(): + assert "Active" in set(get_tenant_states().values()) + + # One tenant should activate, then get stuck in its logical size calculation + wait_until(10, 1, at_least_one_active) + + # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate + time.sleep(5) + + # We should see one tenant win the activation race, and enter logical size calculation.
The rest + # will stay in Attaching state, waiting for the "warmup_limit" semaphore + expect_activated = 1 + states = get_tenant_states() + assert len([s for s in states.values() if s == "Active"]) == expect_activated + assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + + # This is zero, and subsequent checks are expect_activated - 1, because this counter does not + # count how many tenants are Active; it counts how many have finished warmup. The first tenant + # that reached Active is still stuck in its logical size calculation, and has therefore not finished warmup. + assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0 + + # If a client accesses one of the blocked tenants, it should skip waiting for warmup and + # go active as fast as it can. + stuck_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id) + endpoint.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10) g", + ] + ) + endpoint.stop() + + # The one that we successfully accessed is now Active + expect_activated += 1 + assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) + + # The ones we didn't touch are still in Attaching + assert ( + len([s for s in get_tenant_states().values() if s == "Attaching"]) + == n_tenants - expect_activated + ) + + # Timeline creation operations also wake up Attaching tenants + stuck_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + pageserver_http.timeline_create(env.pg_version, stuck_tenant_id, TimelineId.generate()) + expect_activated += 1 + assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" + assert ( + len([s for s in get_tenant_states().values() if s == "Attaching"]) + == n_tenants - expect_activated + ) + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") + == expect_activated - 1 + ) + + # When we unblock logical size calculation, all tenants should proceed to active state via + # the warmup route.
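+ # (Turning the failpoint off lets the paused logical size calculations finish; as each tenant's + # calculation completes, its warmup permit is released and the next tenant can take its turn.)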
+ pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + def all_active(): + assert all(s == "Active" for s in get_tenant_states().values()) + + wait_until(10, 1, all_active) + + # Final control check: restarting with no failpoints at all results in all tenants coming active + # without being prompted by client I/O + env.pageserver.stop() + env.pageserver.start() + wait_until(10, 1, all_active) + + assert ( + pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants + ) + assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants From 83811491da80794a6cd89827db080c79a83def2f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Sat, 16 Dec 2023 09:06:00 +0000 Subject: [PATCH 12/19] update zerocopy (#6148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem https://github.com/neondatabase/neon/security/dependabot/48 ``` $ cargo tree -i zerocopy zerocopy v0.7.3 └── ahash v0.8.5 └── hashbrown v0.13.2 ``` ahash doesn't use the affected APIs we we are not vulnerable but best to update to silence the alert anyway ## Summary of changes ``` $ cargo update -p zerocopy --precise 0.7.31 Updating crates.io index Updating syn v2.0.28 -> v2.0.32 Updating zerocopy v0.7.3 -> v0.7.31 Updating zerocopy-derive v0.7.3 -> v0.7.31 ``` --- Cargo.lock | 56 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 99a936dd40..96f6b494c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -233,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -244,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -881,7 +881,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.28", + "syn 2.0.32", "which", ] @@ -1095,7 +1095,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1486,7 +1486,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1497,7 +1497,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1572,7 +1572,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1666,7 +1666,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1920,7 +1920,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2906,7 +2906,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -3349,7 +3349,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -3556,7 +3556,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -4164,7 +4164,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.28", + "syn 2.0.32", "unicode-ident", ] @@ -4599,7 +4599,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -4680,7 +4680,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -4947,9 +4947,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.28" +version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", @@ -5079,7 +5079,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5197,7 +5197,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5498,7 +5498,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -5943,7 +5943,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -5977,7 +5977,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6314,7 +6314,7 @@ dependencies = [ "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.28", + "syn 2.0.32", "time", "time-macros", "tokio", @@ -6376,22 +6376,22 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.7.3" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74" +checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.3" +version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd" +checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] From 8619e6295a8a32f20282cf8a264f1d1e019a2e05 Mon Sep 17 00:00:00 2001 From: Abhijeet Patil Date: Sat, 16 Dec 2023 10:33:52 +0000 Subject: [PATCH 13/19] CI: build build-tools image (#6082) ## Currently our build docker file is located in the build repo it makes sense to have it as a part of our neon repo ## Summary of changes We had the docker file that we use to build our binary and other tools resided in the build repo It made sense to bring the docker file to its repo where it has been used So that the contributors can also view it and amend if required It will reduce the maintenance. Docker file changes and code changes can be accommodated in same PR Also, building the image and pushing it to ECR is abstracted in a reusable workflow. 
Ideal is to use that for any other jobs too ## Checklist before requesting a review - [x] Moved the docker file used to build the binary from the build repo to the neon repo - [x] adding gh workflow to build and push the image - [x] adding gh workflow to tag the pushed image - [x] update readMe file --------- Co-authored-by: Abhijeet Patil Co-authored-by: Alexander Bayandin --- .../workflows/build_and_push_docker_image.yml | 57 ++++++ .github/workflows/build_and_test.yml | 82 +++++++-- .../workflows/update_build_tools_image.yml | 130 ++++++++++++++ .gitignore | 1 + CONTRIBUTING.md | 14 ++ Dockerfile | 2 +- Dockerfile.buildtools | 165 ++++++++++++++++++ Dockerfile.compute-node | 2 +- Dockerfile.compute-tools | 2 +- 9 files changed, 434 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/build_and_push_docker_image.yml create mode 100644 .github/workflows/update_build_tools_image.yml create mode 100644 Dockerfile.buildtools diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml new file mode 100644 index 0000000000..8780c0fef7 --- /dev/null +++ b/.github/workflows/build_and_push_docker_image.yml @@ -0,0 +1,57 @@ +name: Build and Push Docker Image + +on: + workflow_call: + inputs: + dockerfile-path: + required: true + type: string + image-name: + required: true + type: string + tag: + required: true + type: string + +jobs: + kaniko: + runs-on: [ self-hosted, dev, x64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64 + + kaniko-arm: + runs-on: [ self-hosted, dev, arm64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64 + + manifest: + name: 'manifest' + runs-on: [ self-hosted, dev, x64 ] + needs: + - kaniko + - kaniko-arm + + steps: + - name: Create manifest + run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64 + + - name: Push manifest + run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 693ed1a66f..808d39b33c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -44,13 +44,13 @@ jobs: exit 1 - tag: needs: [ check-permissions ] runs-on: [ 
self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} + build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} steps: - name: Checkout @@ -74,11 +74,50 @@ jobs: shell: bash id: build-tag - check-codestyle-python: + - name: Get buildtools tag + run: | + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then + IMAGE_TAG=$GITHUB_RUN_ID + else + IMAGE_TAG=pinned + fi + + echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT + shell: bash + id: buildtools-tag + + check-if-build-tools-dockerfile-changed: needs: [ check-permissions ] + runs-on: ubuntu-latest + outputs: + docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} + steps: + - name: Check if Dockerfile.buildtools has changed + id: dockerfile + run: | + updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) + if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then + echo "docker_file_changed=true" >> $GITHUB_OUTPUT + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-buildtools-image: + needs: [ check-permissions, check-if-build-tools-dockerfile-changed, tag] + uses: ./.github/workflows/build_and_push_docker_image.yml + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + with: + dockerfile-path: Dockerfile.buildtools + image-name: build-tools + tag: ${{ needs.tag.outputs.build-tools-tag}} + secrets: inherit + + check-codestyle-python: + if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) + needs: [ check-permissions, build-buildtools-image , tag ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} options: --init steps: @@ -108,10 +147,11 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - needs: [ check-permissions ] + if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} options: --init steps: @@ -175,10 +215,11 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag ] + if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) + needs: [ check-permissions, tag, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -411,7 +452,7 @@ jobs: needs: [ check-permissions, build-neon, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb strategy: @@ -447,10 +488,10 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - needs: [ check-permissions, build-neon ] + needs: [ check-permissions, build-neon, tag] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -479,12 +520,12 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, tag ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} options: --init steps: @@ -526,11 +567,10 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests ] - + needs: [ check-permissions, regress-tests, tag ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -694,7 +734,8 @@ jobs: }" neon-image: - needs: [ check-permissions, tag ] + if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: @@ -733,6 +774,7 @@ jobs: --context . 
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} + --build-arg TAG=${{ needs.tag.outputs.build-tools-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -742,8 +784,9 @@ jobs: run: rm -rf ~/.ecr compute-tools-image: + if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -778,6 +821,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.tag.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -788,7 +832,8 @@ jobs: run: rm -rf ~/.ecr compute-node-image: - needs: [ check-permissions, tag ] + if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug @@ -836,6 +881,7 @@ jobs: --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.tag.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml new file mode 100644 index 0000000000..88bab797b7 --- /dev/null +++ b/.github/workflows/update_build_tools_image.yml @@ -0,0 +1,130 @@ +name: 'Update build tools image tag' + +# This workflow is used to update the tag of the build-tools image in ECR. +# The most common use case is adding/moving the `pinned` tag to a `${GITHUB_RUN_ID}` image.
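+# Example invocation (mirrors the snippet added to CONTRIBUTING.md in this patch): +#   gh workflow -R neondatabase/neon run update_build_tools_image.yml -f from-tag=<github-run-id> -f to-tag=pinned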
+ +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + to-tag: + description: 'Destination tag' + required: true + type: string + default: 'pinned' + +defaults: + run: + shell: bash -euo pipefail {0} + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +permissions: {} + +jobs: + tag-image: + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + outputs: + next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} + prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Get source image digest + id: next-digest + run: | + NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" + exit 1 + fi + + echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" + echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT + + - name: Get destination image digest (if already exists) + id: prev-digest + run: | + PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) + if [ -z "${PREV_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" + else + echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" + + echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT + fi + + - name: Tag image + run: | + crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" + + rollback-tag-image: + needs: tag-image + if: ${{ !success() }} + + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Restore previous tag if needed + run: | + NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" + PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" + + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" + exit 0 + fi + + if [ -z "${PREV_DIGEST}" ]; then + # I guess we should delete the tag here/untag the image, but crane does not support it + # - https://github.com/google/go-containerregistry/issues/999 + + echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" + + exit 0 + fi + + CURRENT_DIGEST=$(crane digest 
"${IMAGE}:${TO_TAG}") + if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then + crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" + + echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" + else + echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" + fi diff --git a/.gitignore b/.gitignore index c5fc121ac2..3f4495c9e7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ test_output/ .vscode .idea +neon.iml /.neon /integration_tests/.neon diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2692684006..b318c295a3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,3 +70,17 @@ We're using the following approach to make it work: - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) + +## How do I add the "pinned" tag to an buildtools image? +We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. + +You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, +or using GitHub CLI: + +```bash +gh workflow -R neondatabase/neon run update_build_tools_image.yml \ + -f from-tag=6254913013 \ + -f to-tag=pinned \ + +# Default `-f to-tag` is `pinned`, so the parameter can be omitted. +``` \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 60de9cfa3e..5d5fde4f14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. 
ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned # Build Postgres diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools new file mode 100644 index 0000000000..d3d05b4e20 --- /dev/null +++ b/Dockerfile.buildtools @@ -0,0 +1,165 @@ +FROM debian:bullseye-slim + +# Add nonroot user +RUN useradd -ms /bin/bash nonroot -b /home +SHELL ["/bin/bash", "-c"] + +# System deps +RUN set -e \ + && apt update \ + && apt install -y \ + autoconf \ + automake \ + bison \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flex \ + git \ + gnupg \ + gzip \ + jq \ + libcurl4-openssl-dev \ + libbz2-dev \ + libffi-dev \ + liblzma-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libpq-dev \ + libreadline-dev \ + libseccomp-dev \ + libsqlite3-dev \ + libssl-dev \ + libstdc++-10-dev \ + libtool \ + libxml2-dev \ + libxmlsec1-dev \ + libxxhash-dev \ + lsof \ + make \ + netcat \ + net-tools \ + openssh-client \ + parallel \ + pkg-config \ + unzip \ + wget \ + xz-utils \ + zlib1g-dev \ + zstd \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# protobuf-compiler (protoc) +ENV PROTOC_VERSION 22.2 +RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ + && unzip -q protoc.zip -d protoc \ + && mv protoc/bin/protoc /usr/local/bin/protoc \ + && mv protoc/include/google /usr/local/include/google \ + && rm -rf protoc.zip protoc + +# LLVM +ENV LLVM_VERSION=17 +RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ + && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && apt update \ + && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ + && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# PostgreSQL 14 +RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ + && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ + && apt update \ + && apt install -y postgresql-client-14 \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ + && unzip -q awscliv2.zip \ + && ./aws/install \ + && rm awscliv2.zip + +# Mold: A Modern Linker +ENV MOLD_VERSION v2.1.0 +RUN set -e \ + && git clone https://github.com/rui314/mold.git \ + && mkdir mold/build \ + && cd mold/build \ + && git checkout ${MOLD_VERSION} \ + && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \ + && cmake --build . -j $(nproc) \ + && cmake --install . \ + && cd .. 
\ + && rm -rf mold + +# LCOV +# Build lcov from a fork: +# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) +# And patches from us: +# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) +RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ + && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ + && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ + && cd lcov \ + && make install \ + && rm -rf ../lcov.tar.gz + +# Switch to nonroot user +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Python +ENV PYTHON_VERSION=3.9.2 \ + PYENV_ROOT=/home/nonroot/.pyenv \ + PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH +RUN set -e \ + && cd $HOME \ + && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \ + && chmod +x pyenv-installer \ + && ./pyenv-installer \ + && export PYENV_ROOT=/home/nonroot/.pyenv \ + && export PATH="$PYENV_ROOT/bin:$PATH" \ + && export PATH="$PYENV_ROOT/shims:$PATH" \ + && pyenv install ${PYTHON_VERSION} \ + && pyenv global ${PYTHON_VERSION} \ + && python --version \ + && pip install --upgrade pip \ + && pip --version \ + && pip install pipenv wheel poetry + +# Switch to nonroot user (again) +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Rust +# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) +ENV RUSTC_VERSION=1.74.0 +ENV RUSTUP_HOME="/home/nonroot/.rustup" +ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ + chmod +x rustup-init && \ + ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ + rm rustup-init && \ + export PATH="$HOME/.cargo/bin:$PATH" && \ + . 
"$HOME/.cargo/env" && \ + cargo --version && rustup --version && \ + rustup component add llvm-tools-preview rustfmt clippy && \ + cargo install --git https://github.com/paritytech/cachepot && \ + cargo install rustfilt && \ + cargo install cargo-hakari && \ + cargo install cargo-deny && \ + cargo install cargo-hack && \ + rm -rf /home/nonroot/.cargo/registry && \ + rm -rf /home/nonroot/.cargo/git +ENV RUSTC_WRAPPER=cachepot + +# Show versions +RUN whoami \ + && python --version \ + && pip --version \ + && cargo --version --verbose \ + && rustup --version --verbose \ + && rustc --version --verbose \ + && clang --version diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 03280586f8..3809eb09a9 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,6 +1,6 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 3066e3f7ca..cc305cc556 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,7 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG From 47873470db040b55fbf447a6c392f9a2b1091a02 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 16 Dec 2023 11:52:48 +0100 Subject: [PATCH 14/19] pageserver: add method to dump keyspace in mgmt api client (#6145) Part of getpage@lsn benchmark epic: https://github.com/neondatabase/neon/issues/5771 --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/key.rs | 34 ++++ .../pageserver_api}/src/keyspace.rs | 32 +++- libs/pageserver_api/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 2 + .../pageserver_api/src/models/partitioning.rs | 151 ++++++++++++++++++ pageserver/client/src/mgmt_api.rs | 16 ++ pageserver/src/http/mod.rs | 2 - pageserver/src/http/routes.rs | 77 +-------- pageserver/src/lib.rs | 2 +- pageserver/src/repository.rs | 29 +--- 12 files changed, 246 insertions(+), 102 deletions(-) rename {pageserver => libs/pageserver_api}/src/keyspace.rs (93%) create mode 100644 libs/pageserver_api/src/models/partitioning.rs diff --git a/Cargo.lock b/Cargo.lock index 96f6b494c4..f931fd6c29 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3167,6 +3167,7 @@ dependencies = [ "enum-map", "hex", "postgres_ffi", + "rand 0.8.5", "serde", "serde_json", "serde_with", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 4d08d78e87..4146597d8d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -24,3 +24,4 @@ workspace_hack.workspace = true [dev-dependencies] bincode.workspace = true +rand.workspace = true diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 16715bc667..d680a5600e 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -144,3 +144,37 @@ impl Key { pub fn is_rel_block_key(key: &Key) -> bool { key.field1 == 0x00 && key.field4 != 0 } + +impl std::str::FromStr for Key { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::result::Result { + Self::from_hex(s) + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use crate::key::Key; + + use rand::Rng; + use rand::SeedableRng; + + #[test] + fn display_fromstr_bijection() { + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + + let key = Key { + field1: rng.gen(), + 
field2: rng.gen(), + field3: rng.gen(), + field4: rng.gen(), + field5: rng.gen(), + field6: rng.gen(), + }; + + assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); + } +} diff --git a/pageserver/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs similarity index 93% rename from pageserver/src/keyspace.rs rename to libs/pageserver_api/src/keyspace.rs index 20e6df9c7b..16651c322e 100644 --- a/pageserver/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,12 @@ -use crate::repository::{key_range_size, singleton_range, Key}; use postgres_ffi::BLCKSZ; use std::ops::Range; +use crate::key::Key; + /// /// Represents a set of Keys, in a compact form. /// -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct KeySpace { /// Contiguous ranges of keys that belong to the key space. In key order, /// and with no overlap. @@ -186,6 +187,33 @@ impl KeySpaceRandomAccum { } } +pub fn key_range_size(key_range: &Range) -> u32 { + let start = key_range.start; + let end = key_range.end; + + if end.field1 != start.field1 + || end.field2 != start.field2 + || end.field3 != start.field3 + || end.field4 != start.field4 + { + return u32::MAX; + } + + let start = (start.field5 as u64) << 32 | start.field6 as u64; + let end = (end.field5 as u64) << 32 | end.field6 as u64; + + let diff = end - start; + if diff > u32::MAX as u64 { + u32::MAX + } else { + diff as u32 + } +} + +pub fn singleton_range(key: Key) -> Range { + key..key.next() +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 511c5ed208..b236b93428 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -5,6 +5,7 @@ use const_format::formatcp; /// Public API types pub mod control_api; pub mod key; +pub mod keyspace; pub mod models; pub mod reltag; pub mod shard; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index fbc7d73235..a78ba8ad94 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,3 +1,5 @@ +pub mod partitioning; + use std::{ collections::HashMap, num::{NonZeroU64, NonZeroUsize}, diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs new file mode 100644 index 0000000000..0d287f7be0 --- /dev/null +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -0,0 +1,151 @@ +use utils::lsn::Lsn; + +#[derive(Debug, PartialEq, Eq)] +pub struct Partitioning { + pub keys: crate::keyspace::KeySpace, + + pub at_lsn: Lsn, +} + +impl serde::Serialize for Partitioning { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace); + + impl<'a> serde::Serialize for KeySpace<'a> { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeSeq; + let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?; + for kr in &self.0.ranges { + seq.serialize_element(&KeyRange(kr))?; + } + seq.end() + } + } + + use serde::ser::SerializeMap; + let mut map = serializer.serialize_map(Some(2))?; + map.serialize_key("keys")?; + map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("at_lsn")?; + map.serialize_value(&WithDisplay(&self.at_lsn))?; + map.end() + } +} + +pub struct WithDisplay<'a, T>(&'a T); + +impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { + fn serialize(&self, serializer: S) -> 
std::result::Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.0) + } +} + +pub struct KeyRange<'a>(&'a std::ops::Range); + +impl<'a> serde::Serialize for KeyRange<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeTuple; + let mut t = serializer.serialize_tuple(2)?; + t.serialize_element(&WithDisplay(&self.0.start))?; + t.serialize_element(&WithDisplay(&self.0.end))?; + t.end() + } +} + +impl<'a> serde::Deserialize<'a> for Partitioning { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'a>, + { + pub struct KeySpace(crate::keyspace::KeySpace); + + impl<'de> serde::Deserialize<'de> for KeySpace { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + #[serde(transparent)] + struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key); + + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + struct Range(Key, Key); + + let ranges: Vec = serde::Deserialize::deserialize(deserializer)?; + Ok(Self(crate::keyspace::KeySpace { + ranges: ranges + .into_iter() + .map(|Range(start, end)| (start.0..end.0)) + .collect(), + })) + } + } + + #[serde_with::serde_as] + #[derive(serde::Deserialize)] + struct De { + keys: KeySpace, + #[serde_as(as = "serde_with::DisplayFromStr")] + at_lsn: Lsn, + } + + let de: De = serde::Deserialize::deserialize(deserializer)?; + Ok(Self { + at_lsn: de.at_lsn, + keys: de.keys.0, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_serialization_roundtrip() { + let reference = r#" + { + "keys": [ + [ + "000000000000000000000000000000000000", + "000000000000000000000000000000000001" + ], + [ + "000000067F00000001000000000000000000", + "000000067F00000001000000000000000002" + ], + [ + "030000000000000000000000000000000000", + "030000000000000000000000000000000003" + ] + ], + "at_lsn": "0/2240160" + } + "#; + + let de: Partitioning = serde_json::from_str(reference).unwrap(); + + let ser = serde_json::to_string(&de).unwrap(); + + let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap(); + + assert_eq!( + ser_de, + serde_json::from_str::<'_, serde_json::Value>(reference).unwrap() + ); + } +} diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 821c238082..77eb1bb8e2 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -92,6 +92,22 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn keyspace( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + self.mgmt_api_endpoint + ); + self.get(&uri) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 1c083bd382..c82d1c0362 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,4 +1,2 @@ pub mod routes; pub use routes::make_router; - -pub use pageserver_api::models; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 47092c3bb0..601fad5bde 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -28,10 +28,6 @@ use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; -use super::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, -}; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; @@ -51,6 +47,10 @@ use crate::tenant::timeline::Timeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; +use pageserver_api::models::{ + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, +}; use utils::{ auth::SwappableJwtAuth, generation::Generation, @@ -66,7 +66,7 @@ use utils::{ }; // Imports only used for testing APIs -use super::models::ConfigureFailpointsRequest; +use pageserver_api::models::ConfigureFailpointsRequest; // For APIs that require an Active tenant, how long should we block waiting for that state? 
// This is not functionally necessary (clients will retry), but avoids generating a lot of @@ -1524,69 +1524,6 @@ async fn timeline_collect_keyspace( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - struct Partitioning { - keys: crate::keyspace::KeySpace, - - at_lsn: Lsn, - } - - impl serde::Serialize for Partitioning { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeMap; - let mut map = serializer.serialize_map(Some(2))?; - map.serialize_key("keys")?; - map.serialize_value(&KeySpace(&self.keys))?; - map.serialize_key("at_lsn")?; - map.serialize_value(&WithDisplay(&self.at_lsn))?; - map.end() - } - } - - struct WithDisplay<'a, T>(&'a T); - - impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - serializer.collect_str(&self.0) - } - } - - struct KeySpace<'a>(&'a crate::keyspace::KeySpace); - - impl<'a> serde::Serialize for KeySpace<'a> { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - use serde::ser::SerializeSeq; - let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?; - for kr in &self.0.ranges { - seq.serialize_element(&KeyRange(kr))?; - } - seq.end() - } - } - - struct KeyRange<'a>(&'a std::ops::Range); - - impl<'a> serde::Serialize for KeyRange<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - use serde::ser::SerializeTuple; - let mut t = serializer.serialize_tuple(2)?; - t.serialize_element(&WithDisplay(&self.0.start))?; - t.serialize_element(&WithDisplay(&self.0.end))?; - t.end() - } - } - let at_lsn: Option = parse_query_param(&request, "at_lsn")?; async { @@ -1598,7 +1535,9 @@ async fn timeline_collect_keyspace( .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - json_response(StatusCode::OK, Partitioning { keys, at_lsn }) + let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + + json_response(StatusCode::OK, res) } .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 797cb6f944..58adf6e8c4 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -10,7 +10,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; -pub mod keyspace; +pub use pageserver_api::keyspace; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 97d731bf49..c726139524 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -2,38 +2,11 @@ use crate::walrecord::NeonWalRecord; use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use std::ops::{AddAssign, Range}; +use std::ops::AddAssign; use std::time::Duration; pub use pageserver_api::key::{Key, KEY_SIZE}; -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - 
let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - -pub fn singleton_range(key: Key) -> Range { - key..key.next() -} - /// A 'value' stored for a one Key. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(test, derive(PartialEq))] From 9bdc25f0af272e15997a9103d56c08f8260fe72e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 16 Dec 2023 12:32:00 +0000 Subject: [PATCH 15/19] Revert "CI: build build-tools image" (#6156) It turns out the issue with skipped jobs is not so trivial (because Github checks jobs transitively), a possible workaround with `if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result)` will tangle the workflow really bad. We'll need to come up with a better solution. To unblock the main I'm going to revert https://github.com/neondatabase/neon/pull/6082. --- .../workflows/build_and_push_docker_image.yml | 57 ------ .github/workflows/build_and_test.yml | 82 ++------- .../workflows/update_build_tools_image.yml | 130 -------------- .gitignore | 1 - CONTRIBUTING.md | 14 -- Dockerfile | 2 +- Dockerfile.buildtools | 165 ------------------ Dockerfile.compute-node | 2 +- Dockerfile.compute-tools | 2 +- 9 files changed, 21 insertions(+), 434 deletions(-) delete mode 100644 .github/workflows/build_and_push_docker_image.yml delete mode 100644 .github/workflows/update_build_tools_image.yml delete mode 100644 Dockerfile.buildtools diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml deleted file mode 100644 index 8780c0fef7..0000000000 --- a/.github/workflows/build_and_push_docker_image.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Build and Push Docker Image - -on: - workflow_call: - inputs: - dockerfile-path: - required: true - type: string - image-name: - required: true - type: string - tag: - required: true - type: string - -jobs: - kaniko: - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64 - - kaniko-arm: - runs-on: [ self-hosted, dev, arm64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64 - - manifest: - name: 'manifest' - runs-on: [ self-hosted, dev, x64 ] - needs: - - kaniko - - kaniko-arm - - steps: - - name: Create manifest - run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-amd64 --amend 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }}-arm64 - - - name: Push manifest - run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ inputs.tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 808d39b33c..693ed1a66f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -44,13 +44,13 @@ jobs: exit 1 + tag: needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} - build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} steps: - name: Checkout @@ -74,50 +74,11 @@ jobs: shell: bash id: build-tag - - name: Get buildtools tag - run: | - if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then - IMAGE_TAG=$GITHUB_RUN_ID - else - IMAGE_TAG=pinned - fi - - echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT - shell: bash - id: buildtools-tag - - check-if-build-tools-dockerfile-changed: - needs: [ check-permissions ] - runs-on: ubuntu-latest - outputs: - docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} - steps: - - name: Check if Dockerfile.buildtools has changed - id: dockerfile - run: | - updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) - if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then - echo "docker_file_changed=true" >> $GITHUB_OUTPUT - fi - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - build-buildtools-image: - needs: [ check-permissions, check-if-build-tools-dockerfile-changed, tag] - uses: ./.github/workflows/build_and_push_docker_image.yml - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - with: - dockerfile-path: Dockerfile.buildtools - image-name: build-tools - tag: ${{ needs.tag.outputs.build-tools-tag}} - secrets: inherit - check-codestyle-python: - if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) - needs: [ check-permissions, build-buildtools-image , tag ] + needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init steps: @@ -147,11 +108,10 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init steps: @@ -215,11 +175,10 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) - needs: [ check-permissions, tag, build-buildtools-image ] + needs: [ check-permissions, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init strategy: fail-fast: false @@ -452,7 +411,7 @@ jobs: needs: [ check-permissions, build-neon, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned # Default shared memory is 64mb options: --init --shm-size=512mb strategy: @@ -488,10 +447,10 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - needs: [ check-permissions, build-neon, tag] + needs: [ check-permissions, build-neon ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned # Default shared memory is 64mb options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -520,12 +479,12 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, tag ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init steps: @@ -567,10 +526,11 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, tag ] + needs: [ check-permissions, regress-tests ] + runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.tag.outputs.build-tools-tag }} + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init strategy: fail-fast: false @@ -734,8 +694,7 @@ jobs: }" neon-image: - if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions, tag ] runs-on: [ self-hosted, gen3, large ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: @@ -774,7 +733,6 @@ jobs: --context . 
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} - --build-arg TAG=${{ needs.tag.outputs.build-tools-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -784,9 +742,8 @@ jobs: run: rm -rf ~/.ecr compute-tools-image: - if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -821,7 +778,6 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.tag.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -832,8 +788,7 @@ jobs: run: rm -rf ~/.ecr compute-node-image: - if: always() && contains(fromJSON('["success", "skipped"]'), needs.build-buildtools-image.result) - needs: [ check-permissions, build-buildtools-image, tag ] + needs: [ check-permissions, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug @@ -881,7 +836,6 @@ jobs: --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.tag.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml deleted file mode 100644 index 88bab797b7..0000000000 --- a/.github/workflows/update_build_tools_image.yml +++ /dev/null @@ -1,130 +0,0 @@ -name: 'Update build tools image tag' - -# This workflow it used to update tag of build tools in ECR. -# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. 
- -on: - workflow_dispatch: - inputs: - from-tag: - description: 'Source tag' - required: true - type: string - to-tag: - description: 'Destination tag' - required: true - type: string - default: 'pinned' - -defaults: - run: - shell: bash -euo pipefail {0} - -env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - -permissions: {} - -jobs: - tag-image: - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - outputs: - next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} - prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Get source image digest - id: next-digest - run: | - NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" - exit 1 - fi - - echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" - echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT - - - name: Get destination image digest (if already exists) - id: prev-digest - run: | - PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) - if [ -z "${PREV_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" - else - echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" - - echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT - fi - - - name: Tag image - run: | - crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" - - rollback-tag-image: - needs: tag-image - if: ${{ !success() }} - - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Restore previous tag if needed - run: | - NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" - PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" - - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" - exit 0 - fi - - if [ -z "${PREV_DIGEST}" ]; then - # I guess we should delete the tag here/untag the image, but crane does not support it - # - https://github.com/google/go-containerregistry/issues/999 - - echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" - - exit 0 - fi - - CURRENT_DIGEST=$(crane digest 
"${IMAGE}:${TO_TAG}") - if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then - crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" - - echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" - else - echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" - fi diff --git a/.gitignore b/.gitignore index 3f4495c9e7..c5fc121ac2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,6 @@ __pycache__/ test_output/ .vscode .idea -neon.iml /.neon /integration_tests/.neon diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b318c295a3..2692684006 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,17 +70,3 @@ We're using the following approach to make it work: - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) - -## How do I add the "pinned" tag to an buildtools image? -We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. - -You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, -or using GitHub CLI: - -```bash -gh workflow -R neondatabase/neon run update_build_tools_image.yml \ - -f from-tag=6254913013 \ - -f to-tag=pinned \ - -# Default `-f to-tag` is `pinned`, so the parameter can be omitted. -``` \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 5d5fde4f14..60de9cfa3e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. 
ARG REPOSITORY=neondatabase -ARG IMAGE=build-tools +ARG IMAGE=rust ARG TAG=pinned # Build Postgres diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools deleted file mode 100644 index d3d05b4e20..0000000000 --- a/Dockerfile.buildtools +++ /dev/null @@ -1,165 +0,0 @@ -FROM debian:bullseye-slim - -# Add nonroot user -RUN useradd -ms /bin/bash nonroot -b /home -SHELL ["/bin/bash", "-c"] - -# System deps -RUN set -e \ - && apt update \ - && apt install -y \ - autoconf \ - automake \ - bison \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - flex \ - git \ - gnupg \ - gzip \ - jq \ - libcurl4-openssl-dev \ - libbz2-dev \ - libffi-dev \ - liblzma-dev \ - libncurses5-dev \ - libncursesw5-dev \ - libpq-dev \ - libreadline-dev \ - libseccomp-dev \ - libsqlite3-dev \ - libssl-dev \ - libstdc++-10-dev \ - libtool \ - libxml2-dev \ - libxmlsec1-dev \ - libxxhash-dev \ - lsof \ - make \ - netcat \ - net-tools \ - openssh-client \ - parallel \ - pkg-config \ - unzip \ - wget \ - xz-utils \ - zlib1g-dev \ - zstd \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# protobuf-compiler (protoc) -ENV PROTOC_VERSION 22.2 -RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ - && unzip -q protoc.zip -d protoc \ - && mv protoc/bin/protoc /usr/local/bin/protoc \ - && mv protoc/include/google /usr/local/include/google \ - && rm -rf protoc.zip protoc - -# LLVM -ENV LLVM_VERSION=17 -RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ - && apt update \ - && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ - && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# PostgreSQL 14 -RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ - && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ - && apt update \ - && apt install -y postgresql-client-14 \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# AWS CLI -RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ - && unzip -q awscliv2.zip \ - && ./aws/install \ - && rm awscliv2.zip - -# Mold: A Modern Linker -ENV MOLD_VERSION v2.1.0 -RUN set -e \ - && git clone https://github.com/rui314/mold.git \ - && mkdir mold/build \ - && cd mold/build \ - && git checkout ${MOLD_VERSION} \ - && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \ - && cmake --build . -j $(nproc) \ - && cmake --install . \ - && cd .. 
\ - && rm -rf mold - -# LCOV -# Build lcov from a fork: -# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) -# And patches from us: -# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) -RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ - && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ - && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ - && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ - && cd lcov \ - && make install \ - && rm -rf ../lcov.tar.gz - -# Switch to nonroot user -USER nonroot:nonroot -WORKDIR /home/nonroot - -# Python -ENV PYTHON_VERSION=3.9.2 \ - PYENV_ROOT=/home/nonroot/.pyenv \ - PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH -RUN set -e \ - && cd $HOME \ - && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \ - && chmod +x pyenv-installer \ - && ./pyenv-installer \ - && export PYENV_ROOT=/home/nonroot/.pyenv \ - && export PATH="$PYENV_ROOT/bin:$PATH" \ - && export PATH="$PYENV_ROOT/shims:$PATH" \ - && pyenv install ${PYTHON_VERSION} \ - && pyenv global ${PYTHON_VERSION} \ - && python --version \ - && pip install --upgrade pip \ - && pip --version \ - && pip install pipenv wheel poetry - -# Switch to nonroot user (again) -USER nonroot:nonroot -WORKDIR /home/nonroot - -# Rust -# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.74.0 -ENV RUSTUP_HOME="/home/nonroot/.rustup" -ENV PATH="/home/nonroot/.cargo/bin:${PATH}" -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ - chmod +x rustup-init && \ - ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ - rm rustup-init && \ - export PATH="$HOME/.cargo/bin:$PATH" && \ - . 
"$HOME/.cargo/env" && \ - cargo --version && rustup --version && \ - rustup component add llvm-tools-preview rustfmt clippy && \ - cargo install --git https://github.com/paritytech/cachepot && \ - cargo install rustfilt && \ - cargo install cargo-hakari && \ - cargo install cargo-deny && \ - cargo install cargo-hack && \ - rm -rf /home/nonroot/.cargo/registry && \ - rm -rf /home/nonroot/.cargo/git -ENV RUSTC_WRAPPER=cachepot - -# Show versions -RUN whoami \ - && python --version \ - && pip --version \ - && cargo --version --verbose \ - && rustup --version --verbose \ - && rustc --version --verbose \ - && clang --version diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 3809eb09a9..03280586f8 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,6 +1,6 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase -ARG IMAGE=build-tools +ARG IMAGE=rust ARG TAG=pinned ARG BUILD_TAG diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index cc305cc556..3066e3f7ca 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,7 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml ARG REPOSITORY=neondatabase -ARG IMAGE=build-tools +ARG IMAGE=rust ARG TAG=pinned ARG BUILD_TAG From e1935f42a1d1de4948fa6f689f2fe4cd83d15549 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 27 Nov 2023 15:40:01 +0300 Subject: [PATCH 16/19] Don't generate core dump when walproposer intentionally panics. Walproposer sometimes intentionally PANICs when its term is defeated as the basebackup is likely spoiled by that time. We don't want core dumped in this case. --- pgxn/neon/neon_utils.c | 25 +++++++++++++++++++++++++ pgxn/neon/neon_utils.h | 1 + pgxn/neon/walproposer.c | 15 ++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 06faea7490..807d2decf6 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -1,3 +1,6 @@ + +#include + #include "postgres.h" #include "access/timeline.h" @@ -114,3 +117,25 @@ pq_sendint64_le(StringInfo buf, uint64 i) memcpy(buf->data + buf->len, &i, sizeof(uint64)); buf->len += sizeof(uint64); } + +/* + * Disables core dump for the current process. 
+ */ +void +disable_core_dump() +{ + struct rlimit rlim; + +#ifdef WALPROPOSER_LIB /* skip in simulation mode */ + return; +#endif + + rlim.rlim_cur = 0; + rlim.rlim_max = 0; + if (setrlimit(RLIMIT_CORE, &rlim)) + { + int save_errno = errno; + + fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno)); + } +} diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index e3fafc8d0f..20745d8b26 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -8,5 +8,6 @@ uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); +extern void disable_core_dump(); #endif /* __NEON_UTILS_H__ */ diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7d9dbfdb7f..fc3332612c 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -35,6 +35,8 @@ * *------------------------------------------------------------------------- */ +#include + #include "postgres.h" #include "libpq/pqformat.h" #include "neon.h" @@ -1069,6 +1071,12 @@ DetermineEpochStartLsn(WalProposer *wp) if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm))) { + /* + * Panic to restart PG as we need to retake basebackup. + * However, don't dump core as this is kinda expected + * scenario. + */ + disable_core_dump(); walprop_log(PANIC, "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn), @@ -1445,7 +1453,12 @@ RecvAppendResponses(Safekeeper *sk) if (sk->appendResponse.term > wp->propTerm) { - /* Another compute with higher term is running. */ + /* + * Another compute with higher term is running. Panic to restart + * PG as we likely need to retake basebackup. However, don't dump + * core as this is kinda expected scenario. + */ + disable_core_dump(); walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); From dbdb1d21f2eae8e9616dd12656868fcdb59c603e Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 18 Dec 2023 10:29:19 +0000 Subject: [PATCH 17/19] pageserver: on-demand activation cleanups (#6157) ## Problem #6112 added some logs and metrics: clean these up a bit: - Avoid counting startup completions for tenants launched after startup - exclude no-op cases from timing histograms - remove a rogue log messages --- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1478a1a445..eceef6bf78 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -629,9 +629,12 @@ impl Tenant { "attach tenant", false, async move { + // Is this tenant being spawned as part of process startup? + let starting_up = init_order.is_some(); scopeguard::defer! { - tracing::info!("Increment complete count"); - TENANT.startup_complete.inc(); + if starting_up { + TENANT.startup_complete.inc(); + } } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. 
@@ -711,7 +714,11 @@ impl Tenant { let preload_timer = TENANT.preload.start_timer(); let preload = match mode { - SpawnMode::Create => {None}, + SpawnMode::Create => { + // Don't count the skipped preload into the histogram of preload durations + preload_timer.stop_and_discard(); + None + }, SpawnMode::Normal => { match &remote_storage { Some(remote_storage) => Some( @@ -721,7 +728,11 @@ impl Tenant { tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), ) .await { - Ok(p) => p, + Ok(p) => { + preload_timer.observe_duration(); + p + } + , Err(e) => { make_broken(&tenant_clone, anyhow::anyhow!(e)); return Ok(()); @@ -732,7 +743,6 @@ impl Tenant { } } }; - preload_timer.observe_duration(); // Remote preload is complete. drop(remote_load_completion); @@ -784,15 +794,19 @@ impl Tenant { } } - let attach_timer = TENANT.attach.start_timer(); + // We will time the duration of the attach phase unless this is a creation (attach will do no work) + let attach_timer = match mode { + SpawnMode::Create => None, + SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + }; match tenant_clone.attach(preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); - attach_timer.observe_duration(); + if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - attach_timer.observe_duration(); + if let Some(t)= attach_timer {t.observe_duration();} make_broken(&tenant_clone, anyhow::anyhow!(e)); } } From 17bde7eda516302a57f46bd95140e8d89e68af73 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 18 Dec 2023 10:59:49 +0000 Subject: [PATCH 18/19] proxy refactor large files (#6153) ## Problem The `src/proxy.rs` file is far too large ## Summary of changes Creates 3 new files: ``` src/metrics.rs src/proxy/retry.rs src/proxy/connect_compute.rs ``` --- proxy/src/auth/backend.rs | 4 +- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/hacks.rs | 2 +- proxy/src/auth/credentials.rs | 5 +- proxy/src/compute.rs | 7 +- proxy/src/console/provider.rs | 2 +- proxy/src/console/provider/neon.rs | 2 +- proxy/src/http.rs | 2 +- proxy/src/lib.rs | 1 + proxy/src/metrics.rs | 232 +++++++++++ proxy/src/proxy.rs | 542 +------------------------- proxy/src/proxy/connect_compute.rs | 238 +++++++++++ proxy/src/proxy/retry.rs | 68 ++++ proxy/src/proxy/tests.rs | 5 +- proxy/src/proxy/tests/mitm.rs | 4 +- proxy/src/rate_limiter/limiter.rs | 6 +- proxy/src/serverless.rs | 2 +- proxy/src/serverless/conn_pool.rs | 7 +- proxy/src/serverless/sql_over_http.rs | 2 +- 19 files changed, 579 insertions(+), 554 deletions(-) create mode 100644 proxy/src/metrics.rs create mode 100644 proxy/src/proxy/connect_compute.rs create mode 100644 proxy/src/proxy/retry.rs diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index ba054b53eb..3b09e05bd2 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -11,7 +11,8 @@ use crate::auth::validate_password_and_exchange; use crate::console::errors::GetAuthInfoError; use crate::console::provider::AuthInfo; use crate::console::AuthSecret; -use crate::proxy::{handle_try_wake, retry_after, LatencyTimer}; +use crate::proxy::connect_compute::handle_try_wake; +use crate::proxy::retry::retry_after; use crate::scram; use crate::stream::Stream; use crate::{ @@ -22,6 +23,7 @@ use crate::{ provider::{CachedNodeInfo, ConsoleReqExtra}, Api, }, + metrics::LatencyTimer, stream, url, }; use futures::TryFutureExt; diff --git 
a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index ce52daf16c..5c394ec649 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -4,7 +4,7 @@ use crate::{ compute, config::AuthenticationConfig, console::AuthSecret, - proxy::LatencyTimer, + metrics::LatencyTimer, sasl, stream::{PqStream, Stream}, }; diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index abbd25008b..5dde514bca 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -4,7 +4,7 @@ use super::{ use crate::{ auth::{self, AuthFlow}, console::AuthSecret, - proxy::LatencyTimer, + metrics::LatencyTimer, sasl, stream::{self, Stream}, }; diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 72149e8e29..c04769a199 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,9 +1,8 @@ //! User credentials used in authentication. use crate::{ - auth::password_hack::parse_endpoint_param, - error::UserFacingError, - proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI}, + auth::password_hack::parse_endpoint_param, error::UserFacingError, + metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str, }; use itertools::Itertools; use pq_proto::StartupMessageParams; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index f5f7270bf4..a54ba56e43 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,9 +1,6 @@ use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - console::errors::WakeComputeError, - error::UserFacingError, - proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE}, + auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, + error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index deab966d9e..8d399f26ea 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -21,7 +21,7 @@ pub mod errors { use crate::{ error::{io_error, UserFacingError}, http, - proxy::ShouldRetry, + proxy::retry::ShouldRetry, }; use thiserror::Error; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 192252a0df..f748c9a41f 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -5,7 +5,7 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; -use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}; +use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}; use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 09423eca77..59e1492ed4 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,7 +13,7 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a22600cbb3..2da1eaf482 100644 --- a/proxy/src/lib.rs +++ 
b/proxy/src/lib.rs @@ -16,6 +16,7 @@ pub mod console; pub mod error; pub mod http; pub mod logging; +pub mod metrics; pub mod parse; pub mod protocol2; pub mod proxy; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs new file mode 100644 index 0000000000..8e2a6105b1 --- /dev/null +++ b/proxy/src/metrics.rs @@ -0,0 +1,232 @@ +use ::metrics::{ + exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, + IntCounterPairVec, IntCounterVec, +}; +use prometheus::{ + register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, + IntGaugeVec, +}; + +use once_cell::sync::Lazy; +use tokio::time; + +pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "proxy_opened_db_connections_total", + "Number of opened connections to a database.", + "proxy_closed_db_connections_total", + "Number of closed connections to a database.", + &["protocol"], + ) + .unwrap() +}); + +pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "proxy_opened_client_connections_total", + "Number of opened connections from a client.", + "proxy_closed_client_connections_total", + "Number of closed connections from a client.", + &["protocol"], + ) + .unwrap() +}); + +pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "proxy_accepted_connections_total", + "Number of client connections accepted.", + "proxy_closed_connections_total", + "Number of client connections closed.", + &["protocol"], + ) + .unwrap() +}); + +pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { + register_histogram_vec!( + "proxy_compute_connection_latency_seconds", + "Time it took for proxy to establish a connection to the compute endpoint", + // http/ws/tcp, true/false, true/false, success/failure + // 3 * 2 * 2 * 2 = 24 counters + &["protocol", "cache_miss", "pool_miss", "outcome"], + // largest bucket = 2^16 * 0.5ms = 32s + exponential_buckets(0.0005, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { + register_histogram_vec!( + "proxy_console_request_latency", + "Time it took for proxy to establish a connection to the compute endpoint", + // proxy_wake_compute/proxy_get_role_info + &["request"], + // largest bucket = 2^16 * 0.2ms = 13s + exponential_buckets(0.0002, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_allowed_ips_cache_misses", + "Number of cache hits/misses for allowed ips", + // hit/miss + &["outcome"], + ) + .unwrap() +}); + +pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_control_plane_token_acquire_seconds", + "Time it took for proxy to establish a connection to the compute endpoint", + // largest bucket = 3^16 * 0.05ms = 2.15s + exponential_buckets(0.00005, 3.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "semaphore_control_plane_limit", + "Current limit of the semaphore control plane", + &["limit"], // 2 counters + ) + .unwrap() +}); + +pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_accepted_connections_by_sni", + "Number of connections (per sni).", + &["kind"], + ) + .unwrap() +}); + +pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_allowed_ips_number", + "Number of allowed ips", + 
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], + ) + .unwrap() +}); + +pub struct LatencyTimer { + // time since the stopwatch was started + start: Option, + // accumulated time on the stopwatch + accumulated: std::time::Duration, + // label data + protocol: &'static str, + cache_miss: bool, + pool_miss: bool, + outcome: &'static str, +} + +pub struct LatencyTimerPause<'a> { + timer: &'a mut LatencyTimer, +} + +impl LatencyTimer { + pub fn new(protocol: &'static str) -> Self { + Self { + start: Some(time::Instant::now()), + accumulated: std::time::Duration::ZERO, + protocol, + cache_miss: false, + // by default we don't do pooling + pool_miss: true, + // assume failed unless otherwise specified + outcome: "failed", + } + } + + pub fn pause(&mut self) -> LatencyTimerPause<'_> { + // stop the stopwatch and record the time that we have accumulated + let start = self.start.take().expect("latency timer should be started"); + self.accumulated += start.elapsed(); + LatencyTimerPause { timer: self } + } + + pub fn cache_miss(&mut self) { + self.cache_miss = true; + } + + pub fn pool_hit(&mut self) { + self.pool_miss = false; + } + + pub fn success(mut self) { + self.outcome = "success"; + } +} + +impl Drop for LatencyTimerPause<'_> { + fn drop(&mut self) { + // start the stopwatch again + self.timer.start = Some(time::Instant::now()); + } +} + +impl Drop for LatencyTimer { + fn drop(&mut self) { + let duration = + self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; + COMPUTE_CONNECTION_LATENCY + .with_label_values(&[ + self.protocol, + bool_to_str(self.cache_miss), + bool_to_str(self.pool_miss), + self.outcome, + ]) + .observe(duration.as_secs_f64()) + } +} + +pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_connection_failures_total", + "Number of connection failures (per kind).", + &["kind"], + ) + .unwrap() +}); + +pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_connection_failures_breakdown", + "Number of wake-up failures (per kind).", + &["retry", "kind"], + ) + .unwrap() +}); + +pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_io_bytes_per_client", + "Number of bytes sent/received between client and backend.", + crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, + ) + .unwrap() +}); + +pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_io_bytes", + "Number of bytes sent/received between all clients and backends.", + &["direction"], + ) + .unwrap() +}); + +pub const fn bool_to_str(x: bool) -> &'static str { + if x { + "true" + } else { + "false" + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index da65065179..17e910860c 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,265 +1,41 @@ #[cfg(test)] mod tests; +pub mod connect_compute; +pub mod retry; + use crate::{ auth, cancellation::{self, CancelMap}, - compute::{self, PostgresConnection}, + compute, config::{AuthenticationConfig, ProxyConfig, TlsConfig}, - console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, - http::StatusCode, + console::{self, messages::MetricsAuxInfo}, + metrics::{ + LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, + NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE, + }, protocol2::WithClientIp, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, usage_metrics::{Ids, USAGE_METRICS}, }; 
use anyhow::{bail, Context}; -use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use metrics::{ - exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, - IntCounterPairVec, IntCounterVec, -}; -use once_cell::sync::{Lazy, OnceCell}; +use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; -use prometheus::{ - register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, - IntGaugeVec, -}; use regex::Regex; -use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant}; -use tokio::{ - io::{AsyncRead, AsyncWrite, AsyncWriteExt}, - time, -}; +use std::{net::IpAddr, sync::Arc}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{error, info, info_span, Instrument}; use utils::measured_stream::MeasuredStream; -/// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; +use self::connect_compute::{connect_to_compute, TcpMechanism}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); - -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); - -static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure - // 3 * 2 * 2 * 2 = 24 counters - &["protocol", "cache_miss", "pool_miss", "outcome"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], - // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of 
cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); - -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); - -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); - -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); - -pub struct LatencyTimer { - // time since the stopwatch was started - start: Option, - // accumulated time on the stopwatch - accumulated: std::time::Duration, - // label data - protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, -} - -pub struct LatencyTimerPause<'a> { - timer: &'a mut LatencyTimer, -} - -impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { - Self { - start: Some(Instant::now()), - accumulated: std::time::Duration::ZERO, - protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, - // assume failed unless otherwise specified - outcome: "failed", - } - } - - pub fn pause(&mut self) -> LatencyTimerPause<'_> { - // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); - LatencyTimerPause { timer: self } - } - - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; - } - - pub fn success(mut self) { - self.outcome = "success"; - } -} - -impl Drop for LatencyTimerPause<'_> { - fn drop(&mut self) { - // start the stopwatch again - self.timer.start = Some(Instant::now()); - } -} - -impl Drop for LatencyTimer { - fn drop(&mut self) { - let duration = - self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - ]) - .observe(duration.as_secs_f64()) - } -} - -static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes_per_client", - "Number of bytes sent/received between client and backend.", - crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, - ) - .unwrap() -}); - -static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - 
&["direction"], - ) - .unwrap() -}); - pub async fn run_until_cancelled( f: F, cancellation_token: &CancellationToken, @@ -539,296 +315,6 @@ async fn handshake( } } -/// If we couldn't connect, a cached connection info might be to blame -/// (e.g. the compute node's address might've changed at the wrong time). -/// Invalidate the cache entry (if any) to prevent subsequent errors. -#[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { - let is_cached = node_info.cached(); - if is_cached { - warn!("invalidating stalled compute node info cache entry"); - } - let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", - }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); - - node_info.invalidate().config -} - -/// Try to connect to the compute node once. -#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute_once( - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - proto: &'static str, -) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - - node_info - .config - .connect(allow_self_signed_compute, timeout, proto) - .await -} - -#[async_trait] -pub trait ConnectMechanism { - type Connection; - type ConnectError; - type Error: From; - async fn connect_once( - &self, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result; - - fn update_connect_config(&self, conf: &mut compute::ConnCfg); -} - -pub struct TcpMechanism<'a> { - /// KV-dictionary with PostgreSQL connection params. - pub params: &'a StartupMessageParams, - pub proto: &'static str, -} - -#[async_trait] -impl ConnectMechanism for TcpMechanism<'_> { - type Connection = PostgresConnection; - type ConnectError = compute::ConnectionError; - type Error = compute::ConnectionError; - - async fn connect_once( - &self, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result { - connect_to_compute_once(node_info, timeout, self.proto).await - } - - fn update_connect_config(&self, config: &mut compute::ConnCfg) { - config.set_startup_params(self.params); - } -} - -const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" - } -} - -fn report_error(e: &WakeComputeError, retry: bool) { - use crate::console::errors::ApiError; - let retry = bool_to_str(retry); - let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - "quota_exceeded" - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => "api_console_locked", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. - }) => "api_console_bad_request", - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - "api_console_other_server_error" - } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", - }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); -} - -/// Try to connect to the compute node, retrying if necessary. 
-/// This function might update `node_info`, so we take it by `&mut`. -#[tracing::instrument(skip_all)] -pub async fn connect_to_compute( - mechanism: &M, - mut node_info: console::CachedNodeInfo, - extra: &console::ConsoleReqExtra, - creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, - mut latency_timer: LatencyTimer, -) -> Result -where - M::ConnectError: ShouldRetry + std::fmt::Debug, - M::Error: From, -{ - mechanism.update_connect_config(&mut node_info.config); - - // try once - let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { - Ok(res) => { - latency_timer.success(); - return Ok(res); - } - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - (invalidate_cache(node_info), e) - } - }; - - latency_timer.cache_miss(); - - let mut num_retries = 1; - - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - let node_info = loop { - let wake_res = match creds { - auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, - #[cfg(feature = "testing")] - auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, - // nothing to do? - auth::BackendType::Link(_) => return Err(err.into()), - // test backend - #[cfg(test)] - auth::BackendType::Test(x) => x.wake_compute(), - }; - - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - report_error(&e, false); - return Err(e.into()); - } - // failed to wake up but we can continue to retry - Ok(ControlFlow::Continue(e)) => { - report_error(&e, true); - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - // successfully woke up a compute node and can break the wakeup loop - Ok(ControlFlow::Break(mut node_info)) => { - node_info.config.reuse_password(&config); - mechanism.update_connect_config(&mut node_info.config); - break node_info; - } - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; - }; - - // now that we have a new node, try connect to it repeatedly. - // this can error for a few reasons, for instance: - // * DNS connection settings haven't quite propagated yet - info!("wake_compute success. attempting to connect"); - loop { - match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { - Ok(res) => { - latency_timer.success(); - return Ok(res); - } - Err(e) => { - let retriable = e.should_retry(num_retries); - if !retriable { - error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); - return Err(e.into()); - } - warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); - } - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; - } -} - -/// Attempts to wake up the compute node. -/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. 
- Ok(new) => Ok(ControlFlow::Break(new)), - } -} - -pub trait ShouldRetry { - fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { - match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, - err => err.could_retry(), - } - } -} - -impl ShouldRetry for io::Error { - fn could_retry(&self) -> bool { - use std::io::ErrorKind; - matches!( - self.kind(), - ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut - ) - } -} - -impl ShouldRetry for tokio_postgres::error::DbError { - fn could_retry(&self) -> bool { - use tokio_postgres::error::SqlState; - matches!( - self.code(), - &SqlState::CONNECTION_FAILURE - | &SqlState::CONNECTION_EXCEPTION - | &SqlState::CONNECTION_DOES_NOT_EXIST - | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, - ) - } -} - -impl ShouldRetry for tokio_postgres::Error { - fn could_retry(&self) -> bool { - if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { - io::Error::could_retry(io_err) - } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::could_retry(db_err) - } else { - false - } - } -} - -impl ShouldRetry for compute::ConnectionError { - fn could_retry(&self) -> bool { - match self { - compute::ConnectionError::Postgres(err) => err.could_retry(), - compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), - _ => false, - } - } -} - -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) -} - /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] async fn prepare_client_connection( diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs new file mode 100644 index 0000000000..88b0019c49 --- /dev/null +++ b/proxy/src/proxy/connect_compute.rs @@ -0,0 +1,238 @@ +use crate::{ + auth, + compute::{self, PostgresConnection}, + console::{self, errors::WakeComputeError, Api}, + metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, + proxy::retry::{retry_after, ShouldRetry}, +}; +use async_trait::async_trait; +use hyper::StatusCode; +use pq_proto::StartupMessageParams; +use std::ops::ControlFlow; +use tokio::time; +use tracing::{error, info, warn}; + +const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); + +/// If we couldn't connect, a cached connection info might be to blame +/// (e.g. the compute node's address might've changed at the wrong time). +/// Invalidate the cache entry (if any) to prevent subsequent errors. +#[tracing::instrument(name = "invalidate_cache", skip_all)] +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { + let is_cached = node_info.cached(); + if is_cached { + warn!("invalidating stalled compute node info cache entry"); + } + let label = match is_cached { + true => "compute_cached", + false => "compute_uncached", + }; + NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + + node_info.invalidate().config +} + +/// Try to connect to the compute node once. 
+#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] +async fn connect_to_compute_once( + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + proto: &'static str, +) -> Result { + let allow_self_signed_compute = node_info.allow_self_signed_compute; + + node_info + .config + .connect(allow_self_signed_compute, timeout, proto) + .await +} + +#[async_trait] +pub trait ConnectMechanism { + type Connection; + type ConnectError; + type Error: From; + async fn connect_once( + &self, + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + ) -> Result; + + fn update_connect_config(&self, conf: &mut compute::ConnCfg); +} + +pub struct TcpMechanism<'a> { + /// KV-dictionary with PostgreSQL connection params. + pub params: &'a StartupMessageParams, + pub proto: &'static str, +} + +#[async_trait] +impl ConnectMechanism for TcpMechanism<'_> { + type Connection = PostgresConnection; + type ConnectError = compute::ConnectionError; + type Error = compute::ConnectionError; + + async fn connect_once( + &self, + node_info: &console::CachedNodeInfo, + timeout: time::Duration, + ) -> Result { + connect_to_compute_once(node_info, timeout, self.proto).await + } + + fn update_connect_config(&self, config: &mut compute::ConnCfg) { + config.set_startup_params(self.params); + } +} + +fn report_error(e: &WakeComputeError, retry: bool) { + use crate::console::errors::ApiError; + let retry = bool_to_str(retry); + let kind = match e { + WakeComputeError::BadComputeAddress(_) => "bad_compute_address", + WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + ref text, + }) if text.contains("written data quota exceeded") + || text.contains("the limit for current plan reached") => + { + "quota_exceeded" + } + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::LOCKED, + .. + }) => "api_console_locked", + WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::BAD_REQUEST, + .. + }) => "api_console_bad_request", + WakeComputeError::ApiError(ApiError::Console { status, .. }) + if status.is_server_error() => + { + "api_console_other_server_error" + } + WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", + WakeComputeError::TimeoutError => "timeout_error", + }; + NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); +} + +/// Try to connect to the compute node, retrying if necessary. +/// This function might update `node_info`, so we take it by `&mut`. 
+#[tracing::instrument(skip_all)] +pub async fn connect_to_compute( + mechanism: &M, + mut node_info: console::CachedNodeInfo, + extra: &console::ConsoleReqExtra, + creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + mut latency_timer: LatencyTimer, +) -> Result +where + M::ConnectError: ShouldRetry + std::fmt::Debug, + M::Error: From, +{ + mechanism.update_connect_config(&mut node_info.config); + + // try once + let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => { + latency_timer.success(); + return Ok(res); + } + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + (invalidate_cache(node_info), e) + } + }; + + latency_timer.cache_miss(); + + let mut num_retries = 1; + + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + let node_info = loop { + let wake_res = match creds { + auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, + #[cfg(feature = "testing")] + auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, + // nothing to do? + auth::BackendType::Link(_) => return Err(err.into()), + // test backend + #[cfg(test)] + auth::BackendType::Test(x) => x.wake_compute(), + }; + + match handle_try_wake(wake_res, num_retries) { + Err(e) => { + error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + report_error(&e, false); + return Err(e.into()); + } + // failed to wake up but we can continue to retry + Ok(ControlFlow::Continue(e)) => { + report_error(&e, true); + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + } + // successfully woke up a compute node and can break the wakeup loop + Ok(ControlFlow::Break(mut node_info)) => { + node_info.config.reuse_password(&config); + mechanism.update_connect_config(&mut node_info.config); + break node_info; + } + } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + + time::sleep(wait_duration).await; + }; + + // now that we have a new node, try connect to it repeatedly. + // this can error for a few reasons, for instance: + // * DNS connection settings haven't quite propagated yet + info!("wake_compute success. attempting to connect"); + loop { + match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => { + latency_timer.success(); + return Ok(res); + } + Err(e) => { + let retriable = e.should_retry(num_retries); + if !retriable { + error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + return Err(e.into()); + } + warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + } + } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + + time::sleep(wait_duration).await; + } +} + +/// Attempts to wake up the compute node. +/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable +/// * Returns Ok(Break(node)) if the wakeup succeeded +/// * Returns Err(e) if there was an error +pub fn handle_try_wake( + result: Result, + num_retries: u32, +) -> Result, WakeComputeError> { + match result { + Err(err) => match &err { + WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + Ok(ControlFlow::Continue(err)) + } + _ => Err(err), + }, + // Ready to try again. 
+ Ok(new) => Ok(ControlFlow::Break(new)), + } +} diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs new file mode 100644 index 0000000000..a85ed380b0 --- /dev/null +++ b/proxy/src/proxy/retry.rs @@ -0,0 +1,68 @@ +use crate::compute; +use std::{error::Error, io}; +use tokio::time; + +/// Number of times we should retry the `/proxy_wake_compute` http request. +/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 +pub const NUM_RETRIES_CONNECT: u32 = 16; +const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); +const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; + +pub trait ShouldRetry { + fn could_retry(&self) -> bool; + fn should_retry(&self, num_retries: u32) -> bool { + match self { + _ if num_retries >= NUM_RETRIES_CONNECT => false, + err => err.could_retry(), + } + } +} + +impl ShouldRetry for io::Error { + fn could_retry(&self) -> bool { + use std::io::ErrorKind; + matches!( + self.kind(), + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut + ) + } +} + +impl ShouldRetry for tokio_postgres::error::DbError { + fn could_retry(&self) -> bool { + use tokio_postgres::error::SqlState; + matches!( + self.code(), + &SqlState::CONNECTION_FAILURE + | &SqlState::CONNECTION_EXCEPTION + | &SqlState::CONNECTION_DOES_NOT_EXIST + | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ) + } +} + +impl ShouldRetry for tokio_postgres::Error { + fn could_retry(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::could_retry(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::could_retry(db_err) + } else { + false + } + } +} + +impl ShouldRetry for compute::ConnectionError { + fn could_retry(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.could_retry(), + compute::ConnectionError::CouldNotConnect(err) => err.could_retry(), + _ => false, + } + } +} + +pub fn retry_after(num_retries: u32) -> time::Duration { + BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +} diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 4691abbfb9..3c483c59ee 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,10 +2,13 @@ mod mitm; +use super::connect_compute::ConnectMechanism; +use super::retry::ShouldRetry; use super::*; use crate::auth::backend::{ComputeUserInfo, TestBackend}; use crate::config::CertResolver; use crate::console::{CachedNodeInfo, NodeInfo}; +use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; use rstest::rstest; @@ -423,7 +426,7 @@ impl ConnectMechanism for TestConnectMechanism { async fn connect_once( &self, _node_info: &console::CachedNodeInfo, - _timeout: time::Duration, + _timeout: std::time::Duration, ) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 50b3034936..a0a84a1dc0 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -120,7 +120,7 @@ where struct PgFrame; impl Decoder for PgFrame { type Item = Bytes; - type Error = io::Error; + type Error = std::io::Error; fn decode(&mut self, src: &mut BytesMut) -> Result, Self::Error> { if src.len() < 5 { @@ -136,7 +136,7 @@ impl Decoder for PgFrame { } } impl Encoder for 
PgFrame { - type Error = io::Error; + type Error = std::io::Error; fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> { dst.extend_from_slice(&item); diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 8dfdfcd3db..a190b2cf8f 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -393,10 +393,10 @@ impl Limiter { } new_limit }; - crate::proxy::RATE_LIMITER_LIMIT + crate::metrics::RATE_LIMITER_LIMIT .with_label_values(&["expected"]) .set(new_limit as i64); - crate::proxy::RATE_LIMITER_LIMIT + crate::metrics::RATE_LIMITER_LIMIT .with_label_values(&["actual"]) .set(actual_limit as i64); self.limits.store(new_limit, Ordering::Release); @@ -470,7 +470,7 @@ impl reqwest_middleware::Middleware for Limiter { ) })?; info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::proxy::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); + crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); match next.run(req, extensions).await { Ok(response) => { self.release(token, Some(Outcome::from_reqwest_response(&response))) diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 870e9c1103..e358a0712f 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -13,8 +13,8 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; +use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; -use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE; use crate::rate_limiter::EndpointRateLimiter; use crate::{cancellation::CancelMap, config::ProxyConfig}; use futures::StreamExt; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 69198d79d3..ab8903418b 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -24,13 +24,12 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, console, - proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE}, + metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE}, + proxy::{connect_compute::ConnectMechanism, neon_options}, usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, }; use crate::{compute, config}; -use crate::proxy::ConnectMechanism; - use tracing::{error, warn, Span}; use tracing::{info, info_span, Instrument}; @@ -444,7 +443,7 @@ async fn connect_to_compute( .await? 
.context("missing cache entry from wake_compute")?; - crate::proxy::connect_to_compute( + crate::proxy::connect_compute::connect_to_compute( &TokioMechanism { conn_id, conn_info, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 795ba819c1..307b085ce0 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -29,7 +29,7 @@ use utils::http::error::ApiError; use utils::http::json::json_response; use crate::config::HttpConfig; -use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use super::conn_pool::ConnInfo; use super::conn_pool::GlobalConnPool; From 33cb9a68f7dde14a851ff5893f7107b4fc53856e Mon Sep 17 00:00:00 2001 From: John Khvatov Date: Mon, 18 Dec 2023 16:33:23 +0300 Subject: [PATCH 19/19] pageserver: Reduce tracing overhead in timeline::get (#6115) ## Problem Compaction process (specifically the image layer reconstructions part) is lagging behind wal ingest (at speed ~10-15MB/s) for medium-sized tenants (30-50GB). CPU profile shows that significant amount of time (see flamegraph) is being spent in `tracing::span::Span::new`. mainline (commit: 0ba4cae491c26c7678f7abddb68bf76134a7df90): ![reconstruct-mainline-0ba4cae491c2](https://github.com/neondatabase/neon/assets/289788/ebfd262e-5c97-4858-80c7-664a1dbcc59d) ## Summary of changes By lowering the tracing level in get_value_reconstruct_data and get_or_maybe_download from info to debug, we can reduce the overhead of span creation in prod environments. On my system, this sped up the image reconstruction process by 60% (from 14500 to 23160 page reconstruction per sec) pr: ![reconstruct-opt-2](https://github.com/neondatabase/neon/assets/289788/563a159b-8f2f-4300-b0a1-6cd66e7df769) `create_image_layers()` (it's 1 CPU bound here) mainline vs pr: ![image](https://github.com/neondatabase/neon/assets/289788/a981e3cb-6df9-4882-8a94-95e99c35aa83) --- pageserver/src/tenant/storage_layer/layer.rs | 160 ++++++++++--------- test_runner/regress/test_broken_timeline.py | 4 +- 2 files changed, 85 insertions(+), 79 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a4b102c314..9a8ddc1a6b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -259,8 +259,9 @@ impl Layer { layer .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self)) + .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) .await + .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } /// Download the layer if evicted. @@ -654,7 +655,6 @@ impl LayerInner { } /// Cancellation safe. 
- #[tracing::instrument(skip_all, fields(layer=%self))] async fn get_or_maybe_download( self: &Arc, allow_download: bool, @@ -663,95 +663,101 @@ impl LayerInner { let mut init_permit = None; loop { - let download = move |permit| async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let download = move |permit| { + async move { + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); + // no need to make the evict_and_wait wait for the actual download to complete + drop(self.status.send(Status::Downloaded)); - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // FIXME: grab a gate - let can_ever_evict = timeline.remote_client.as_ref().is_some(); + let can_ever_evict = timeline.remote_client.as_ref().is_some(); - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed)?; - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); + let permit = if let Some(reason) = needs_download { + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. + self.wanted_evicted.store(false, Ordering::Release); + + if !can_ever_evict { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... + return Err(DownloadError::DownloadRequired); + } + + tracing::info!(%reason, "downloading on-demand"); + + self.spawn_download_and_wait(timeline, permit).await? + } else { + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. 
+ LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + permit + }; + + let since_last_eviction = + self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!( + waiters, + "completing the on-demand download for other tasks" + ); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + scopeguard::ScopeGuard::into_inner(init_cancelled); - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + Ok((ResidentOrWantedEvicted::Resident(res), permit)) } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); - } - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) }; if let Some(init_permit) = init_permit.take() { diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 53eeb8bbe9..4da0ba7b20 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*layer loading failed:.*", + ".*get_value_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. 
Current state: Broken.*", @@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="layer loading failed:") as err: + with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
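
The pattern applied by this last patch generalizes: keep hot-path spans at DEBUG so that an INFO-level subscriber never pays for span construction, and compensate on the error path by attaching the identifying detail to the error itself. A minimal, hypothetical sketch of that pattern (not the pageserver code; `read_page` and `layer_name` are invented for illustration):

```rust
use anyhow::Context;
use tracing::Instrument;

// Hot-path function: the span is created at DEBUG level, so when the
// subscriber only enables INFO and above, `debug_span!` resolves to a
// disabled span and the `Span::new` cost is avoided.
async fn read_page(layer_name: &str) -> anyhow::Result<Vec<u8>> {
    async {
        // ... per-key work that runs millions of times during compaction ...
        Ok::<_, anyhow::Error>(vec![0u8; 8192])
    }
    .instrument(tracing::debug_span!("get_value_reconstruct_data", layer = %layer_name))
    .await
    // The error path compensates for the quieter span: attach the layer name
    // to the error, as the patch does with `with_context`.
    .with_context(|| format!("get_value_reconstruct_data for layer {layer_name}"))
}
```

With a subscriber configured for INFO (e.g. `tracing_subscriber::fmt().with_max_level(tracing::Level::INFO)`), the `debug_span!` call above is disabled and costs little more than a cached level check, which is the effect the flamegraphs in the commit message measure.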