Merge remote-tracking branch 'origin/main' into arpad/lsn_by_ts

2026-05-29 19:10:38 +00:00 · 2023-11-07 05:40:46 +01:00
parent 090246cad7 ad5b02e175
commit 52017eff68
103 changed files with 5783 additions and 3587 deletions
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -6,7 +6,6 @@
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -19,7 +18,6 @@ pub type PgIdent = String;

 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
-#[serde_as]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -50,12 +48,12 @@ pub struct ComputeSpec {
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub pageserver_connstring: Option<String>,
+
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -140,14 +138,13 @@ impl RemoteExtSpec {
    }
 }

-#[serde_as]
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeMode {
    /// A read-write node
    #[default]
    Primary,
    /// A read-only node, pinned at a particular LSN
-    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
+    Static(Lsn),
    /// A read-only node that follows the tip of the branch in hot standby mode
    ///
    /// Future versions may want to distinguish between replicas with hot standby
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,7 +4,6 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId};

 #[derive(Serialize, Deserialize)]
@@ -12,10 +11,8 @@ pub struct ReAttachRequest {
    pub node_id: NodeId,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -25,10 +22,8 @@ pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -43,10 +38,8 @@ pub struct ValidateResponse {
    pub tenants: Vec<ValidateResponseTenant>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,7 +6,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
+use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
@@ -174,25 +174,19 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub new_timeline_id: TimelineId,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub new_tenant_id: TenantId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
@@ -201,7 +195,6 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
 #[derive(Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLoadRequest {
@@ -278,31 +271,26 @@ pub struct LocationConfig {
    pub tenant_conf: TenantConfig,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
+pub struct TenantCreateResponse(pub TenantId);

 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -374,10 +362,8 @@ pub enum TenantAttachmentStatus {
    Failed { reason: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
@@ -388,33 +374,22 @@ pub struct TenantInfo {
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,

-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub last_record_lsn: Lsn,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub prev_record_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub latest_gc_cutoff_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have succesfully uploaded to remote storage
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertizing to safekeepers
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn_visible: Lsn,

    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -426,7 +401,6 @@ pub struct TimelineInfo {
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
@@ -523,23 +497,13 @@ pub struct LayerAccessStats {
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
-    Open {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-    },
-    Frozen {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_end: Lsn,
-    },
+    Open { lsn_start: Lsn },
+    Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
@@ -547,9 +511,7 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
@@ -558,7 +520,6 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -728,12 +728,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message_noflush(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
+                    match e {
+                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        e => {
+                            log_query_error(query_string, &e);
+                            let short_error = short_error(&e);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &short_error,
+                                Some(e.pg_error_code()),
+                            ))?;
+                        }
+                    }
                }
                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
            }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,21 +1,18 @@
 //! Azure Blob Storage wrapper

+use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::sync::Arc;
-use std::{borrow::Cow, collections::HashMap, io::Cursor};
+use std::{borrow::Cow, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::Header;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{
-    blob::operations::GetBlobBuilder,
-    prelude::{BlobClient, ContainerClient},
-};
+use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use futures_util::StreamExt;
 use http_types::StatusCode;
 use tokio::io::AsyncRead;
@@ -112,16 +109,19 @@ impl AzureBlobStorage {

    async fn download_for_builder(
        &self,
-        metadata: StorageMetadata,
        builder: GetBlobBuilder,
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

+        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
        let mut buf = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
+            if let Some(blob_meta) = part.blob.metadata {
+                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+            }
            let data = part
                .data
                .collect()
@@ -131,28 +131,9 @@ impl AzureBlobStorage {
        }
        Ok(Download {
            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(metadata),
+            metadata: Some(StorageMetadata(metadata)),
        })
    }
-    // TODO get rid of this function once we have metadata included in the response
-    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
-    async fn get_metadata(
-        &self,
-        blob_client: &BlobClient,
-    ) -> Result<StorageMetadata, DownloadError> {
-        let builder = blob_client.get_metadata();
-
-        let response = builder.into_future().await.map_err(to_download_error)?;
-        let mut map = HashMap::new();
-
-        for md in response.metadata.iter() {
-            map.insert(
-                md.name().as_str().to_string(),
-                md.value().as_str().to_string(),
-            );
-        }
-        Ok(StorageMetadata(map))
-    }

    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
        self.concurrency_limiter
@@ -269,11 +250,9 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let builder = blob_client.get();

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn download_byte_range(
@@ -285,8 +264,6 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let mut builder = blob_client.get();

        if let Some(end_exclusive) = end_exclusive {
@@ -301,7 +278,7 @@ impl RemoteStorage for AzureBlobStorage {
            builder = builder.range(Range::new(start_inclusive, end_exclusive));
        }

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,23 +1,18 @@
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};

 use utils::{
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub peer_ids: Option<Vec<NodeId>>,
    pub pg_version: u32,
    pub system_id: Option<u64>,
    pub wal_seg_size: Option<u32>,
-    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    // If not passed, it is assigned to the beginning of commit_lsn segment.
    pub local_start_lsn: Option<Lsn>,
@@ -28,7 +23,6 @@ fn lsn_invalid() -> Lsn {
 }

 /// Data about safekeeper's timeline, mirrors broker.proto.
-#[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
    /// Term.
@@ -36,25 +30,19 @@ pub struct SkTimelineInfo {
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub commit_lsn: Lsn,
    /// LSN up to which safekeeper has backed WAL.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub backup_lsn: Lsn,
    /// LSN of last checkpoint uploaded by pageserver.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub remote_consistent_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub peer_horizon_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub local_start_lsn: Lsn,
    /// A connection string to use for WAL receiving.
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -55,6 +55,7 @@ bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
+serde_assert.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -9,7 +9,6 @@ use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};

 use crate::id::TenantId;

@@ -32,11 +31,9 @@ pub enum Scope {
 }

 /// JWT payload. See docs/authentication.md for the format
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
    pub scope: Scope,
 }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 ///
 /// See docs/rfcs/025-generation-numbers.md for detail on how generation
 /// numbers are used.
-#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
 pub enum Generation {
    // Generations with this magic value will not add a suffix to S3 keys, and will not
    // be included in persisted index_part.json.  This value is only to be used
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -0,0 +1,41 @@
+/// Useful type for asserting that expected bytes match reporting the bytes more readable
+/// array-syntax compatible hex bytes.
+///
+/// # Usage
+///
+/// ```
+/// use utils::Hex;
+///
+/// let actual = serialize_something();
+/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
+///
+/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
+/// // output suffixed with an array style length for easier comparisons.
+/// assert_eq!(Hex(&actual), Hex(&expected));
+///
+/// // with `let expected = [0x68];` the error would had been:
+/// // assertion `left == right` failed
+/// //  left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
+/// // right: [0x68; 1]
+/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
+/// ```
+#[derive(PartialEq)]
+pub struct Hex<'a>(pub &'a [u8]);
+
+impl std::fmt::Debug for Hex<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+        for (i, c) in self.0.chunks(16).enumerate() {
+            if i > 0 && !c.is_empty() {
+                writeln!(f, ", ")?;
+            }
+            for (j, b) in c.iter().enumerate() {
+                if j > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "0x{b:02x}")?;
+            }
+        }
+        write!(f, "; {}]", self.0.len())
+    }
+}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -14,6 +14,11 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
 use std::future::Future;
 use std::str::FromStr;

+use bytes::{Bytes, BytesMut};
+use std::io::Write as _;
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -146,94 +151,89 @@ impl Drop for RequestCancelled {
    }
 }

+/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
+pub struct ChannelWriter {
+    buffer: BytesMut,
+    pub tx: mpsc::Sender<std::io::Result<Bytes>>,
+    written: usize,
+}
+
+impl ChannelWriter {
+    pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
+        assert_ne!(buf_len, 0);
+        ChannelWriter {
+            // split about half off the buffer from the start, because we flush depending on
+            // capacity. first flush will come sooner than without this, but now resizes will
+            // have better chance of picking up the "other" half. not guaranteed of course.
+            buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
+            tx,
+            written: 0,
+        }
+    }
+
+    pub fn flush0(&mut self) -> std::io::Result<usize> {
+        let n = self.buffer.len();
+        if n == 0 {
+            return Ok(0);
+        }
+
+        tracing::trace!(n, "flushing");
+        let ready = self.buffer.split().freeze();
+
+        // not ideal to call from blocking code to block_on, but we are sure that this
+        // operation does not spawn_blocking other tasks
+        let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
+            self.tx.send(Ok(ready)).await.map_err(|_| ())?;
+
+            // throttle sending to allow reuse of our buffer in `write`.
+            self.tx.reserve().await.map_err(|_| ())?;
+
+            // now the response task has picked up the buffer and hopefully started
+            // sending it to the client.
+            Ok(())
+        });
+        if res.is_err() {
+            return Err(std::io::ErrorKind::BrokenPipe.into());
+        }
+        self.written += n;
+        Ok(n)
+    }
+
+    pub fn flushed_bytes(&self) -> usize {
+        self.written
+    }
+}
+
+impl std::io::Write for ChannelWriter {
+    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+        let remaining = self.buffer.capacity() - self.buffer.len();
+
+        let out_of_space = remaining < buf.len();
+
+        let original_len = buf.len();
+
+        if out_of_space {
+            let can_still_fit = buf.len() - remaining;
+            self.buffer.extend_from_slice(&buf[..can_still_fit]);
+            buf = &buf[can_still_fit..];
+            self.flush0()?;
+        }
+
+        // assume that this will often under normal operation just move the pointer back to the
+        // beginning of allocation, because previous split off parts are already sent and
+        // dropped.
+        self.buffer.extend_from_slice(buf);
+        Ok(original_len)
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.flush0().map(|_| ())
+    }
+}
+
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    use bytes::{Bytes, BytesMut};
-    use std::io::Write as _;
-    use tokio::sync::mpsc;
-    use tokio_stream::wrappers::ReceiverStream;
-
    SERVE_METRICS_COUNT.inc();

-    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
-    struct ChannelWriter {
-        buffer: BytesMut,
-        tx: mpsc::Sender<std::io::Result<Bytes>>,
-        written: usize,
-    }
-
-    impl ChannelWriter {
-        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
-            assert_ne!(buf_len, 0);
-            ChannelWriter {
-                // split about half off the buffer from the start, because we flush depending on
-                // capacity. first flush will come sooner than without this, but now resizes will
-                // have better chance of picking up the "other" half. not guaranteed of course.
-                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
-                tx,
-                written: 0,
-            }
-        }
-
-        fn flush0(&mut self) -> std::io::Result<usize> {
-            let n = self.buffer.len();
-            if n == 0 {
-                return Ok(0);
-            }
-
-            tracing::trace!(n, "flushing");
-            let ready = self.buffer.split().freeze();
-
-            // not ideal to call from blocking code to block_on, but we are sure that this
-            // operation does not spawn_blocking other tasks
-            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
-                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
-
-                // throttle sending to allow reuse of our buffer in `write`.
-                self.tx.reserve().await.map_err(|_| ())?;
-
-                // now the response task has picked up the buffer and hopefully started
-                // sending it to the client.
-                Ok(())
-            });
-            if res.is_err() {
-                return Err(std::io::ErrorKind::BrokenPipe.into());
-            }
-            self.written += n;
-            Ok(n)
-        }
-
-        fn flushed_bytes(&self) -> usize {
-            self.written
-        }
-    }
-
-    impl std::io::Write for ChannelWriter {
-        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
-            let remaining = self.buffer.capacity() - self.buffer.len();
-
-            let out_of_space = remaining < buf.len();
-
-            let original_len = buf.len();
-
-            if out_of_space {
-                let can_still_fit = buf.len() - remaining;
-                self.buffer.extend_from_slice(&buf[..can_still_fit]);
-                buf = &buf[can_still_fit..];
-                self.flush0()?;
-            }
-
-            // assume that this will often under normal operation just move the pointer back to the
-            // beginning of allocation, because previous split off parts are already sent and
-            // dropped.
-            self.buffer.extend_from_slice(buf);
-            Ok(original_len)
-        }
-
-        fn flush(&mut self) -> std::io::Result<()> {
-            self.flush0().map(|_| ())
-        }
-    }
-
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -3,6 +3,7 @@ use std::{fmt, str::FromStr};
 use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
+use serde::de::Visitor;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;

@@ -17,12 +18,74 @@ pub enum IdError {
 ///
 /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
 /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-///
-/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
-/// Check the `serde_with::serde_as` documentation for options for more complex types.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
 struct Id([u8; 16]);

+impl Serialize for Id {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            self.0.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Id {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> Visitor<'de> for IdVisitor {
+            type Value = Id;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 16])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 16] = Deserialize::deserialize(s)?;
+                Ok(Id::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Id::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                16,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 impl Id {
    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
        let mut arr = [0u8; 16];
@@ -308,3 +371,112 @@ impl fmt::Display for NodeId {
        write!(f, "{}", self.0)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};
+
+    use crate::bin_ser::BeSer;
+
+    use super::*;
+
+    #[test]
+    fn test_id_serde_non_human_readable() {
+        let original_id = Id([
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ]);
+        let expected_tokens = Tokens(vec![
+            Token::Tuple { len: 16 },
+            Token::U8(173),
+            Token::U8(80),
+            Token::U8(132),
+            Token::U8(115),
+            Token::U8(129),
+            Token::U8(226),
+            Token::U8(72),
+            Token::U8(254),
+            Token::U8(170),
+            Token::U8(201),
+            Token::U8(135),
+            Token::U8(108),
+            Token::U8(199),
+            Token::U8(26),
+            Token::U8(228),
+            Token::U8(24),
+            Token::TupleEnd,
+        ]);
+
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let serialized_tokens = original_id.serialize(&serializer).unwrap();
+        assert_eq!(serialized_tokens, expected_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(serialized_tokens)
+            .build();
+        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
+        assert_eq!(deserialized_id, original_id);
+    }
+
+    #[test]
+    fn test_id_serde_human_readable() {
+        let original_id = Id([
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ]);
+        let expected_tokens = Tokens(vec![Token::Str(String::from(
+            "ad50847381e248feaac9876cc71ae418",
+        ))]);
+
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let serialized_tokens = original_id.serialize(&serializer).unwrap();
+        assert_eq!(serialized_tokens, expected_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(Tokens(vec![Token::Str(String::from(
+                "ad50847381e248feaac9876cc71ae418",
+            ))]))
+            .build();
+        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
+    }
+
+    macro_rules! roundtrip_type {
+        ($type:ty, $expected_bytes:expr) => {{
+            let expected_bytes: [u8; 16] = $expected_bytes;
+            let original_id = <$type>::from(expected_bytes);
+
+            let ser_bytes = original_id.ser().unwrap();
+            assert_eq!(ser_bytes, expected_bytes);
+
+            let des_id = <$type>::des(&ser_bytes).unwrap();
+            assert_eq!(des_id, original_id);
+        }};
+    }
+
+    #[test]
+    fn test_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(Id, expected_bytes);
+    }
+
+    #[test]
+    fn test_tenant_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(TenantId, expected_bytes);
+    }
+
+    #[test]
+    fn test_timeline_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(TimelineId, expected_bytes);
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -24,6 +24,10 @@ pub mod auth;

 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
+
+mod hex;
+pub use hex::Hex;
+
 // http endpoint utils
 pub mod http;

@@ -74,6 +78,8 @@ pub mod completion;
 pub mod error;

 pub mod exp_counter;
+/// async timeout helper
+pub mod timeout;

 pub mod sync;

--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,7 +1,7 @@
 #![warn(missing_docs)]

 use camino::Utf8Path;
-use serde::{Deserialize, Serialize};
+use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
 use std::str::FromStr;
@@ -13,10 +13,114 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
-#[serde(transparent)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
 pub struct Lsn(pub u64);

+impl Serialize for Lsn {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            self.0.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Lsn {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct LsnVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> Visitor<'de> for LsnVisitor {
+            type Value = Lsn;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str(
+                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
+                    )
+                } else {
+                    formatter.write_str("value in form of integer(u64)")
+                }
+            }
+
+            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Ok(Lsn(v))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Lsn::from_str(v).map_err(|e| E::custom(e))
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(LsnVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_u64(LsnVisitor {
+                is_human_readable_deserializer: false,
+            })
+        }
+    }
+}
+
+/// Allows (de)serialization of an `Lsn` always as `u64`.
+///
+/// ### Example
+///
+/// ```rust
+/// # use serde::{Serialize, Deserialize};
+/// use utils::lsn::Lsn;
+///
+/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
+/// struct Foo {
+///   #[serde(with = "utils::lsn::serde_as_u64")]
+///   always_u64: Lsn,
+/// }
+///
+/// let orig = Foo { always_u64: Lsn(1234) };
+///
+/// let res = serde_json::to_string(&orig).unwrap();
+/// assert_eq!(res, r#"{"always_u64":1234}"#);
+///
+/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
+/// assert_eq!(foo, orig);
+/// ```
+///
+pub mod serde_as_u64 {
+    use super::Lsn;
+
+    /// Serializes the Lsn as u64 disregarding the human readability of the format.
+    ///
+    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
+    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
+        use serde::Serialize;
+        lsn.0.serialize(serializer)
+    }
+
+    /// Deserializes the Lsn as u64 disregarding the human readability of the format.
+    ///
+    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
+    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
+        use serde::Deserialize;
+        u64::deserialize(deserializer).map(Lsn)
+    }
+}
+
 /// We tried to parse an LSN from a string, but failed
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("LsnParseError")]
@@ -264,8 +368,13 @@ impl MonotonicCounter<Lsn> for RecordLsn {

 #[cfg(test)]
 mod tests {
+    use crate::bin_ser::BeSer;
+
    use super::*;

+    use serde::ser::Serialize;
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};
+
    #[test]
    fn test_lsn_strings() {
        assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -341,4 +450,95 @@ mod tests {
        assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
        assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
    }
+
+    #[test]
+    fn test_lsn_serde() {
+        let original_lsn = Lsn(0x0123456789abcdef);
+        let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
+        let expected_non_readable_tokens =
+            Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
+
+        // Testing human_readable ser/de
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+        assert_eq!(readable_ser_tokens, expected_readable_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(readable_ser_tokens)
+            .build();
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+
+        // Testing NON human_readable ser/de
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+        assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(non_readable_ser_tokens)
+            .build();
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+
+        // Testing mismatching ser/de
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(non_readable_ser_tokens)
+            .build();
+        Lsn::deserialize(&mut deserializer).unwrap_err();
+
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(readable_ser_tokens)
+            .build();
+        Lsn::deserialize(&mut deserializer).unwrap_err();
+    }
+
+    #[test]
+    fn test_lsn_ensure_roundtrip() {
+        let original_lsn = Lsn(0xaaaabbbb);
+
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(ser_tokens)
+            .build();
+
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+    }
+
+    #[test]
+    fn test_lsn_bincode_serde() {
+        let lsn = Lsn(0x0123456789abcdef);
+        let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
+
+        let ser_bytes = lsn.ser().unwrap();
+        assert_eq!(ser_bytes, expected_bytes);
+
+        let des_lsn = Lsn::des(&ser_bytes).unwrap();
+        assert_eq!(des_lsn, lsn);
+    }
+
+    #[test]
+    fn test_lsn_bincode_ensure_roundtrip() {
+        let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
+        let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
+
+        let ser_bytes = original_lsn.ser().unwrap();
+        assert_eq!(ser_bytes, expected_bytes);
+
+        let des_lsn = Lsn::des(&ser_bytes).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+    }
 }
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -3,7 +3,6 @@ use std::time::{Duration, SystemTime};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use pq_proto::{read_cstr, PG_EPOCH};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use tracing::{trace, warn};

 use crate::lsn::Lsn;
@@ -15,21 +14,17 @@ use crate::lsn::Lsn;
 ///
 /// serde Serialize is used only for human readable dump to json (e.g. in
 /// safekeepers debug_dump).
-#[serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct PageserverFeedback {
    /// Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
    /// LSN last received and ingested by the pageserver. Controls backpressure.
-    #[serde_as(as = "DisplayFromStr")]
    pub last_received_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver to its local disc.
    /// Controls backpressure.
-    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
    /// consider WAL before it can be removed.
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1 +1,3 @@
 pub mod heavier_once_cell;
+
+pub mod gate;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -0,0 +1,151 @@
+use std::{sync::Arc, time::Duration};
+
+/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
+///
+/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
+/// the resource calls `close()` when they want to ensure that all holders of guards
+/// have released them, and that no future guards will be issued.
+pub struct Gate {
+    /// Each caller of enter() takes one unit from the semaphore. In close(), we
+    /// take all the units to ensure all GateGuards are destroyed.
+    sem: Arc<tokio::sync::Semaphore>,
+
+    /// For observability only: a name that will be used to log warnings if a particular
+    /// gate is holding up shutdown
+    name: String,
+}
+
+/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
+/// not complete.
+#[derive(Debug)]
+pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
+
+/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
+async fn warn_if_stuck<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_period: std::time::Duration,
+) -> <Fut as std::future::Future>::Output {
+    let started = std::time::Instant::now();
+
+    let mut fut = std::pin::pin!(fut);
+
+    loop {
+        match tokio::time::timeout(warn_period, &mut fut).await {
+            Ok(ret) => return ret,
+            Err(_) => {
+                tracing::warn!(
+                    gate = name,
+                    elapsed_ms = started.elapsed().as_millis(),
+                    "still waiting, taking longer than expected..."
+                );
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum GateError {
+    GateClosed,
+}
+
+impl Gate {
+    const MAX_UNITS: u32 = u32::MAX;
+
+    pub fn new(name: String) -> Self {
+        Self {
+            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
+            name,
+        }
+    }
+
+    /// Acquire a guard that will prevent close() calls from completing. If close()
+    /// was already called, this will return an error which should be interpreted
+    /// as "shutting down".
+    ///
+    /// This function would typically be used from e.g. request handlers. While holding
+    /// the guard returned from this function, it is important to respect a CancellationToken
+    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
+    /// also contain a CancellationToken.
+    pub fn enter(&self) -> Result<GateGuard, GateError> {
+        self.sem
+            .clone()
+            .try_acquire_owned()
+            .map(GateGuard)
+            .map_err(|_| GateError::GateClosed)
+    }
+
+    /// Types with a shutdown() method and a gate should call this method at the
+    /// end of shutdown, to ensure that all GateGuard holders are done.
+    ///
+    /// This will wait for all guards to be destroyed.  For this to complete promptly, it is
+    /// important that the holders of such guards are respecting a CancellationToken which has
+    /// been cancelled before entering this function.
+    pub async fn close(&self) {
+        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
+    }
+
+    async fn do_close(&self) {
+        tracing::debug!(gate = self.name, "Closing Gate...");
+        match self.sem.acquire_many(Self::MAX_UNITS).await {
+            Ok(_units) => {
+                // While holding all units, close the semaphore.  All subsequent calls to enter() will fail.
+                self.sem.close();
+            }
+            Err(_) => {
+                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
+                // This is legal.  Timeline::shutdown for example is not protected from being called more than
+                // once.
+                tracing::debug!(gate = self.name, "Double close")
+            }
+        }
+        tracing::debug!(gate = self.name, "Closed Gate.")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use futures::FutureExt;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_idle_gate() {
+        // Having taken no gates, we should not be blocked in close
+        let gate = Gate::new("test".to_string());
+        gate.close().await;
+
+        // If a guard is dropped before entering, close should not be blocked
+        let gate = Gate::new("test".to_string());
+        let guard = gate.enter().unwrap();
+        drop(guard);
+        gate.close().await;
+
+        // Entering a closed guard fails
+        gate.enter().expect_err("enter should fail after close");
+    }
+
+    #[tokio::test]
+    async fn test_busy_gate() {
+        let gate = Gate::new("test".to_string());
+
+        let guard = gate.enter().unwrap();
+
+        let mut close_fut = std::pin::pin!(gate.close());
+
+        // Close should be blocked
+        assert!(close_fut.as_mut().now_or_never().is_none());
+
+        // Attempting to enter() should fail, even though close isn't done yet.
+        gate.enter()
+            .expect_err("enter should fail after entering close");
+
+        drop(guard);
+
+        // Guard is gone, close should finish
+        assert!(close_fut.as_mut().now_or_never().is_some());
+
+        // Attempting to enter() is still forbidden
+        gate.enter().expect_err("enter should fail finishing close");
+    }
+}
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,4 +1,7 @@
-use std::sync::{Arc, Mutex, MutexGuard};
+use std::sync::{
+    atomic::{AtomicUsize, Ordering},
+    Arc, Mutex, MutexGuard,
+};
 use tokio::sync::Semaphore;

 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
@@ -10,6 +13,7 @@ use tokio::sync::Semaphore;
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
    inner: Mutex<Inner<T>>,
+    initializers: AtomicUsize,
 }

 impl<T> Default for OnceCell<T> {
@@ -17,6 +21,7 @@ impl<T> Default for OnceCell<T> {
    fn default() -> Self {
        Self {
            inner: Default::default(),
+            initializers: AtomicUsize::new(0),
        }
    }
 }
@@ -49,6 +54,7 @@ impl<T> OnceCell<T> {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
+            initializers: AtomicUsize::new(0),
        }
    }

@@ -60,8 +66,8 @@ impl<T> OnceCell<T> {
    /// Initialization is panic-safe and cancellation-safe.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
-        F: FnOnce() -> Fut,
-        Fut: std::future::Future<Output = Result<T, E>>,
+        F: FnOnce(InitPermit) -> Fut,
+        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
    {
        let sem = {
            let guard = self.inner.lock().unwrap();
@@ -71,29 +77,61 @@ impl<T> OnceCell<T> {
            guard.init_semaphore.clone()
        };

-        let permit = sem.acquire_owned().await;
-        if permit.is_err() {
-            let guard = self.inner.lock().unwrap();
-            assert!(
-                guard.value.is_some(),
-                "semaphore got closed, must be initialized"
-            );
-            return Ok(Guard(guard));
-        } else {
-            // now we try
-            let value = factory().await?;
+        let permit = {
+            // increment the count for the duration of queued
+            let _guard = CountWaitingInitializers::start(self);
+            sem.acquire_owned().await
+        };

-            let mut guard = self.inner.lock().unwrap();
-            assert!(
-                guard.value.is_none(),
-                "we won permit, must not be initialized"
-            );
-            guard.value = Some(value);
-            guard.init_semaphore.close();
-            Ok(Guard(guard))
+        match permit {
+            Ok(permit) => {
+                let permit = InitPermit(permit);
+                let (value, _permit) = factory(permit).await?;
+
+                let guard = self.inner.lock().unwrap();
+
+                Ok(Self::set0(value, guard))
+            }
+            Err(_closed) => {
+                let guard = self.inner.lock().unwrap();
+                assert!(
+                    guard.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                return Ok(Guard(guard));
+            }
        }
    }

+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
+    /// to complete initializing the inner value.
+    ///
+    /// # Panics
+    ///
+    /// If the inner has already been initialized.
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        let guard = self.inner.lock().unwrap();
+
+        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
+        // give more permits right now.
+        if guard.init_semaphore.try_acquire().is_ok() {
+            drop(guard);
+            panic!("permit is of wrong origin");
+        }
+
+        Self::set0(value, guard)
+    }
+
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
+        if guard.value.is_some() {
+            drop(guard);
+            unreachable!("we won permit, must not be initialized");
+        }
+        guard.value = Some(value);
+        guard.init_semaphore.close();
+        Guard(guard)
+    }
+
    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();
@@ -103,6 +141,28 @@ impl<T> OnceCell<T> {
            None
        }
    }
+
+    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
+    pub fn initializer_count(&self) -> usize {
+        self.initializers.load(Ordering::Relaxed)
+    }
+}
+
+/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
+/// initializing task for example at the end of initialization.
+struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
+
+impl<'a, T> CountWaitingInitializers<'a, T> {
+    fn start(target: &'a OnceCell<T>) -> Self {
+        target.initializers.fetch_add(1, Ordering::Relaxed);
+        CountWaitingInitializers(target)
+    }
+}
+
+impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
+    fn drop(&mut self) {
+        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
+    }
 }

 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
@@ -135,7 +195,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
        let mut swapped = Inner::default();
        let permit = swapped
            .init_semaphore
@@ -145,11 +205,14 @@ impl<'a, T> Guard<'a, T> {
        std::mem::swap(&mut *self.0, &mut swapped);
        swapped
            .value
-            .map(|v| (v, permit))
+            .map(|v| (v, InitPermit(permit)))
            .expect("guard is not created unless value has been initialized")
    }
 }

+/// Type held by OnceCell (de)initializing task.
+pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -185,11 +248,11 @@ mod tests {
                    barrier.wait().await;
                    let won = {
                        let g = cell
-                            .get_or_init(|| {
+                            .get_or_init(|permit| {
                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                async {
                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
-                                    Ok::<_, Infallible>(i)
+                                    Ok::<_, Infallible>((i, permit))
                                }
                            })
                            .await
@@ -243,7 +306,7 @@ mod tests {
        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
            .await
            .unwrap();

@@ -258,18 +321,32 @@ mod tests {
        assert_eq!(*cell.get().unwrap(), reinit);
    }

+    #[test]
+    fn reinit_with_deinit_permit() {
+        let cell = Arc::new(OnceCell::new(42));
+
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);
+
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
+        assert_eq!(5, five);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
+    }
+
    #[tokio::test]
    async fn initialization_attemptable_until_ok() {
        let cell = OnceCell::default();

        for _ in 0..10 {
-            cell.get_or_init(|| async { Err("whatever error") })
+            cell.get_or_init(|_permit| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
-            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");
@@ -281,11 +358,11 @@ mod tests {

        let barrier = tokio::sync::Barrier::new(2);

-        let initializer = cell.get_or_init(|| async {
+        let initializer = cell.get_or_init(|permit| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

-            Ok::<_, Infallible>("never reached")
+            Ok::<_, Infallible>(("never reached", permit))
        });

        tokio::select! {
@@ -298,7 +375,7 @@ mod tests {
        assert!(cell.get().is_none());

        let g = cell
-            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -0,0 +1,37 @@
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+
+pub enum TimeoutCancellableError {
+    Timeout,
+    Cancelled,
+}
+
+/// Wrap [`tokio::time::timeout`] with a CancellationToken.
+///
+/// This wrapper is appropriate for any long running operation in a task
+/// that ought to respect a CancellationToken (which means most tasks).
+///
+/// The only time you should use a bare tokio::timeout is when the future `F`
+/// itself respects a CancellationToken: otherwise, always use this wrapper
+/// with your CancellationToken to ensure that your task does not hold up
+/// graceful shutdown.
+pub async fn timeout_cancellable<F>(
+    duration: Duration,
+    cancel: &CancellationToken,
+    future: F,
+) -> Result<F::Output, TimeoutCancellableError>
+where
+    F: std::future::Future,
+{
+    tokio::select!(
+        r = tokio::time::timeout(duration, future) => {
+            r.map_err(|_| TimeoutCancellableError::Timeout)
+
+        },
+        _ = cancel.cancelled() => {
+            Err(TimeoutCancellableError::Cancelled)
+
+        }
+    )
+}
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -21,11 +21,6 @@ pub struct FileCacheState {

 #[derive(Debug)]
 pub struct FileCacheConfig {
-    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
-    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
-    /// memory available for the cgroup.
-    pub(crate) in_memory: bool,
-
    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///
@@ -59,22 +54,9 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl FileCacheConfig {
-    pub fn default_in_memory() -> Self {
+impl Default for FileCacheConfig {
+    fn default() -> Self {
        Self {
-            in_memory: true,
-            // 75 %
-            resource_multiplier: 0.75,
-            // 640 MiB; (512 + 128)
-            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
-            // ensure any increase in file cache size is split 90-10 with 10% to other memory
-            spread_factor: 0.1,
-        }
-    }
-
-    pub fn default_on_disk() -> Self {
-        Self {
-            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -83,7 +65,9 @@ impl FileCacheConfig {
            spread_factor: 0.1,
        }
    }
+}

+impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -39,16 +39,6 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

-    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
-    /// kernel's page cache), and therefore should not count against available memory.
-    //
-    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
-    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
-    // during the switch away from an in-memory file cache, we had to default to the previous
-    // behavior.
-    #[arg(long)]
-    pub file_cache_on_disk: bool,
-
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -156,10 +156,7 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = match args.file_cache_on_disk {
-                true => FileCacheConfig::default_on_disk(),
-                false => FileCacheConfig::default_in_memory(),
-            };
+            let config = FileCacheConfig::default();

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -187,10 +184,7 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

-            if args.file_cache_on_disk {
-                file_cache_disk_size = actual_size;
-            }
-
+            file_cache_disk_size = actual_size;
            state.filecache = Some(file_cache);
        }

@@ -239,17 +233,11 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let (expected_file_cache_size, expected_file_cache_disk_size) = self
+        let expected_file_cache_size = self
            .filecache
            .as_ref()
-            .map(|file_cache| {
-                let size = file_cache.config.calculate_cache_size(usable_system_memory);
-                match file_cache.config.in_memory {
-                    true => (size, 0),
-                    false => (size, size),
-                }
-            })
-            .unwrap_or((0, 0));
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

@@ -273,7 +261,7 @@ impl Runner {

            let new_threshold = self
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_size);

            let current = last_history.avg_non_reclaimable;

@@ -300,13 +288,10 @@ impl Runner {
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
-            }
+            file_cache_disk_size = actual_usage;
            let message = format!(
-                "set file cache size to {} MiB (in memory = {})",
+                "set file cache size to {} MiB",
                bytes_to_mebibytes(actual_usage),
-                file_cache.config.in_memory,
            );
            info!("downscale: {message}");
            status.push(message);
@@ -357,9 +342,7 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
-            }
+            file_cache_disk_size = actual_usage;

            if actual_usage != expected_usage {
                warn!(