wip

2026-01-27 07:10:37 +00:00 · 2023-11-01 20:50:20 -04:00 · 2023-11-01 17:13:56 -04:00
76 changed files with 917 additions and 1983 deletions
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,9 +17,9 @@ assignees: ''
 ## Implementation ideas


-```[tasklist]
 ## Tasks
-```
+- [ ]
+

 ## Other related tasks and Epics
 - 
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3550,7 +3550,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3563,7 +3563,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3574,7 +3574,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3592,7 +3592,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4419,7 +4419,6 @@ dependencies = [
 "itertools",
 "pageserver",
 "rand 0.8.5",
- "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
@@ -4478,7 +4477,6 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
- "tokio-stream",
 "toml_edit",
 "tracing",
 "url",
@@ -4681,16 +4679,6 @@ dependencies = [
 "serde_derive",
 ]

-[[package]]
-name = "serde_assert"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eda563240c1288b044209be1f0d38bb4d15044fb3e00dc354fbc922ab4733e80"
-dependencies = [
- "hashbrown 0.13.2",
- "serde",
-]
-
 [[package]]
 name = "serde_derive"
 version = "1.0.183"
@@ -5408,7 +5396,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -5977,7 +5965,6 @@ dependencies = [
 "routerify",
 "sentry",
 "serde",
- "serde_assert",
 "serde_json",
 "serde_with",
 "signal-hook",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -124,7 +124,6 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
-serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
@@ -162,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -203,7 +202,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ################# Binary contents sections

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -283,6 +283,7 @@ fn main() -> Result<()> {
                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
+            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
@@ -309,6 +310,7 @@ fn main() -> Result<()> {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
                        addr: vm_monitor_addr.clone(),
+                        file_cache_on_disk,
                    })),
                    token.clone(),
                ))
@@ -480,8 +482,6 @@ fn cli() -> clap::Command {
                .value_name("FILECACHE_CONNSTR"),
        )
        .arg(
-            // DEPRECATED, NO LONGER DOES ANYTHING.
-            // See https://github.com/neondatabase/cloud/issues/7516
            Arg::new("file-cache-on-disk")
                .long("file-cache-on-disk")
                .action(clap::ArgAction::SetTrue),
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
    base_uri: &str,
    compute_id: &str,
 ) -> Result<Option<ComputeSpec>> {
-    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
+    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,6 +2,7 @@ use crate::{background_process, local_env::LocalEnv};
 use anyhow::anyhow;
 use camino::Utf8PathBuf;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::{path::PathBuf, process::Child};
 use utils::id::{NodeId, TenantId};

@@ -13,8 +14,10 @@ pub struct AttachmentService {

 const COMMAND: &str = "attachment_service";

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    pub node_id: Option<NodeId>,
 }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,6 +46,7 @@ use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -56,10 +57,13 @@ use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
+#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
    endpoint_id: String,
+    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context};
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use std::env;
 use std::fs;
@@ -32,6 +33,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
+#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
@@ -57,6 +59,7 @@ pub struct LocalEnv {
    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub default_tenant_id: Option<TenantId>,

    // used to issue tokens during e.g pg start
@@ -81,6 +84,7 @@ pub struct LocalEnv {
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -6,6 +6,7 @@
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -18,6 +19,7 @@ pub type PgIdent = String;

 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
+#[serde_as]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -48,12 +50,12 @@ pub struct ComputeSpec {
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
-
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub timeline_id: Option<TimelineId>,
-
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub pageserver_connstring: Option<String>,
-
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -138,13 +140,14 @@ impl RemoteExtSpec {
    }
 }

+#[serde_as]
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeMode {
    /// A read-write node
    #[default]
    Primary,
    /// A read-only node, pinned at a particular LSN
-    Static(Lsn),
+    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
    /// A read-only node that follows the tip of the branch in hot standby mode
    ///
    /// Future versions may want to distinguish between replicas with hot standby
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,6 +4,7 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId};

 #[derive(Serialize, Deserialize)]
@@ -11,8 +12,10 @@ pub struct ReAttachRequest {
    pub node_id: NodeId,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -22,8 +25,10 @@ pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -38,8 +43,10 @@ pub struct ValidateResponse {
    pub tenants: Vec<ValidateResponseTenant>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,7 +6,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
-use serde_with::serde_as;
+use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
    completion,
@@ -174,19 +174,25 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub new_timeline_id: TimelineId,
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub new_tenant_id: TenantId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
@@ -195,6 +201,7 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[serde_as]
 #[derive(Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLoadRequest {
@@ -271,26 +278,31 @@ pub struct LocationConfig {
    pub tenant_conf: TenantConfig,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct TenantCreateResponse(pub TenantId);
+pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);

 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -362,8 +374,10 @@ pub enum TenantAttachmentStatus {
    Failed { reason: String },
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
@@ -374,22 +388,33 @@ pub struct TenantInfo {
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,

+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
    pub last_record_lsn: Lsn,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub prev_record_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
    pub latest_gc_cutoff_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have succesfully uploaded to remote storage
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertizing to safekeepers
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn_visible: Lsn,

    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -401,6 +426,7 @@ pub struct TimelineInfo {
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
@@ -497,13 +523,23 @@ pub struct LayerAccessStats {
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

+#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
-    Open { lsn_start: Lsn },
-    Frozen { lsn_start: Lsn, lsn_end: Lsn },
+    Open {
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_start: Lsn,
+    },
+    Frozen {
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_start: Lsn,
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_end: Lsn,
+    },
 }

+#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
@@ -511,7 +547,9 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

+        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
+        #[serde_as(as = "DisplayFromStr")]
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
@@ -520,6 +558,7 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

+        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,18 +1,23 @@
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};

 use utils::{
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub peer_ids: Option<Vec<NodeId>>,
    pub pg_version: u32,
    pub system_id: Option<u64>,
    pub wal_seg_size: Option<u32>,
+    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    // If not passed, it is assigned to the beginning of commit_lsn segment.
    pub local_start_lsn: Option<Lsn>,
@@ -23,6 +28,7 @@ fn lsn_invalid() -> Lsn {
 }

 /// Data about safekeeper's timeline, mirrors broker.proto.
+#[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
    /// Term.
@@ -30,19 +36,25 @@ pub struct SkTimelineInfo {
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub commit_lsn: Lsn,
    /// LSN up to which safekeeper has backed WAL.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub backup_lsn: Lsn,
    /// LSN of last checkpoint uploaded by pageserver.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub remote_consistent_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub peer_horizon_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub local_start_lsn: Lsn,
    /// A connection string to use for WAL receiving.
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -55,7 +55,6 @@ bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
-serde_assert.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -9,6 +9,7 @@ use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};

 use crate::id::TenantId;

@@ -31,9 +32,11 @@ pub enum Scope {
 }

 /// JWT payload. See docs/authentication.md for the format
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
    pub scope: Scope,
 }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 ///
 /// See docs/rfcs/025-generation-numbers.md for detail on how generation
 /// numbers are used.
-#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
 pub enum Generation {
    // Generations with this magic value will not add a suffix to S3 keys, and will not
    // be included in persisted index_part.json.  This value is only to be used
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -1,41 +0,0 @@
-/// Useful type for asserting that expected bytes match reporting the bytes more readable
-/// array-syntax compatible hex bytes.
-///
-/// # Usage
-///
-/// ```
-/// use utils::Hex;
-///
-/// let actual = serialize_something();
-/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
-///
-/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
-/// // output suffixed with an array style length for easier comparisons.
-/// assert_eq!(Hex(&actual), Hex(&expected));
-///
-/// // with `let expected = [0x68];` the error would had been:
-/// // assertion `left == right` failed
-/// //  left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
-/// // right: [0x68; 1]
-/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
-/// ```
-#[derive(PartialEq)]
-pub struct Hex<'a>(pub &'a [u8]);
-
-impl std::fmt::Debug for Hex<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "[")?;
-        for (i, c) in self.0.chunks(16).enumerate() {
-            if i > 0 && !c.is_empty() {
-                writeln!(f, ", ")?;
-            }
-            for (j, b) in c.iter().enumerate() {
-                if j > 0 {
-                    write!(f, ", ")?;
-                }
-                write!(f, "0x{b:02x}")?;
-            }
-        }
-        write!(f, "; {}]", self.0.len())
-    }
-}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -14,11 +14,6 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
 use std::future::Future;
 use std::str::FromStr;

-use bytes::{Bytes, BytesMut};
-use std::io::Write as _;
-use tokio::sync::mpsc;
-use tokio_stream::wrappers::ReceiverStream;
-
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -151,89 +146,94 @@ impl Drop for RequestCancelled {
    }
 }

-/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
-pub struct ChannelWriter {
-    buffer: BytesMut,
-    pub tx: mpsc::Sender<std::io::Result<Bytes>>,
-    written: usize,
-}
-
-impl ChannelWriter {
-    pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
-        assert_ne!(buf_len, 0);
-        ChannelWriter {
-            // split about half off the buffer from the start, because we flush depending on
-            // capacity. first flush will come sooner than without this, but now resizes will
-            // have better chance of picking up the "other" half. not guaranteed of course.
-            buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
-            tx,
-            written: 0,
-        }
-    }
-
-    pub fn flush0(&mut self) -> std::io::Result<usize> {
-        let n = self.buffer.len();
-        if n == 0 {
-            return Ok(0);
-        }
-
-        tracing::trace!(n, "flushing");
-        let ready = self.buffer.split().freeze();
-
-        // not ideal to call from blocking code to block_on, but we are sure that this
-        // operation does not spawn_blocking other tasks
-        let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
-            self.tx.send(Ok(ready)).await.map_err(|_| ())?;
-
-            // throttle sending to allow reuse of our buffer in `write`.
-            self.tx.reserve().await.map_err(|_| ())?;
-
-            // now the response task has picked up the buffer and hopefully started
-            // sending it to the client.
-            Ok(())
-        });
-        if res.is_err() {
-            return Err(std::io::ErrorKind::BrokenPipe.into());
-        }
-        self.written += n;
-        Ok(n)
-    }
-
-    pub fn flushed_bytes(&self) -> usize {
-        self.written
-    }
-}
-
-impl std::io::Write for ChannelWriter {
-    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
-        let remaining = self.buffer.capacity() - self.buffer.len();
-
-        let out_of_space = remaining < buf.len();
-
-        let original_len = buf.len();
-
-        if out_of_space {
-            let can_still_fit = buf.len() - remaining;
-            self.buffer.extend_from_slice(&buf[..can_still_fit]);
-            buf = &buf[can_still_fit..];
-            self.flush0()?;
-        }
-
-        // assume that this will often under normal operation just move the pointer back to the
-        // beginning of allocation, because previous split off parts are already sent and
-        // dropped.
-        self.buffer.extend_from_slice(buf);
-        Ok(original_len)
-    }
-
-    fn flush(&mut self) -> std::io::Result<()> {
-        self.flush0().map(|_| ())
-    }
-}
-
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    use bytes::{Bytes, BytesMut};
+    use std::io::Write as _;
+    use tokio::sync::mpsc;
+    use tokio_stream::wrappers::ReceiverStream;
+
    SERVE_METRICS_COUNT.inc();

+    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
+    struct ChannelWriter {
+        buffer: BytesMut,
+        tx: mpsc::Sender<std::io::Result<Bytes>>,
+        written: usize,
+    }
+
+    impl ChannelWriter {
+        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
+            assert_ne!(buf_len, 0);
+            ChannelWriter {
+                // split about half off the buffer from the start, because we flush depending on
+                // capacity. first flush will come sooner than without this, but now resizes will
+                // have better chance of picking up the "other" half. not guaranteed of course.
+                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
+                tx,
+                written: 0,
+            }
+        }
+
+        fn flush0(&mut self) -> std::io::Result<usize> {
+            let n = self.buffer.len();
+            if n == 0 {
+                return Ok(0);
+            }
+
+            tracing::trace!(n, "flushing");
+            let ready = self.buffer.split().freeze();
+
+            // not ideal to call from blocking code to block_on, but we are sure that this
+            // operation does not spawn_blocking other tasks
+            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
+                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
+
+                // throttle sending to allow reuse of our buffer in `write`.
+                self.tx.reserve().await.map_err(|_| ())?;
+
+                // now the response task has picked up the buffer and hopefully started
+                // sending it to the client.
+                Ok(())
+            });
+            if res.is_err() {
+                return Err(std::io::ErrorKind::BrokenPipe.into());
+            }
+            self.written += n;
+            Ok(n)
+        }
+
+        fn flushed_bytes(&self) -> usize {
+            self.written
+        }
+    }
+
+    impl std::io::Write for ChannelWriter {
+        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+            let remaining = self.buffer.capacity() - self.buffer.len();
+
+            let out_of_space = remaining < buf.len();
+
+            let original_len = buf.len();
+
+            if out_of_space {
+                let can_still_fit = buf.len() - remaining;
+                self.buffer.extend_from_slice(&buf[..can_still_fit]);
+                buf = &buf[can_still_fit..];
+                self.flush0()?;
+            }
+
+            // assume that this will often under normal operation just move the pointer back to the
+            // beginning of allocation, because previous split off parts are already sent and
+            // dropped.
+            self.buffer.extend_from_slice(buf);
+            Ok(original_len)
+        }
+
+        fn flush(&mut self) -> std::io::Result<()> {
+            self.flush0().map(|_| ())
+        }
+    }
+
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -3,7 +3,6 @@ use std::{fmt, str::FromStr};
 use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
-use serde::de::Visitor;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;

@@ -18,74 +17,12 @@ pub enum IdError {
 ///
 /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
 /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+///
+/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
+/// Check the `serde_with::serde_as` documentation for options for more complex types.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
 struct Id([u8; 16]);

-impl Serialize for Id {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            self.0.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Id {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> Visitor<'de> for IdVisitor {
-            type Value = Id;
-
-            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 16])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 16] = Deserialize::deserialize(s)?;
-                Ok(Id::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Id::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                16,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 impl Id {
    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
        let mut arr = [0u8; 16];
@@ -371,112 +308,3 @@ impl fmt::Display for NodeId {
        write!(f, "{}", self.0)
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use serde_assert::{Deserializer, Serializer, Token, Tokens};
-
-    use crate::bin_ser::BeSer;
-
-    use super::*;
-
-    #[test]
-    fn test_id_serde_non_human_readable() {
-        let original_id = Id([
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ]);
-        let expected_tokens = Tokens(vec![
-            Token::Tuple { len: 16 },
-            Token::U8(173),
-            Token::U8(80),
-            Token::U8(132),
-            Token::U8(115),
-            Token::U8(129),
-            Token::U8(226),
-            Token::U8(72),
-            Token::U8(254),
-            Token::U8(170),
-            Token::U8(201),
-            Token::U8(135),
-            Token::U8(108),
-            Token::U8(199),
-            Token::U8(26),
-            Token::U8(228),
-            Token::U8(24),
-            Token::TupleEnd,
-        ]);
-
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let serialized_tokens = original_id.serialize(&serializer).unwrap();
-        assert_eq!(serialized_tokens, expected_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(serialized_tokens)
-            .build();
-        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
-        assert_eq!(deserialized_id, original_id);
-    }
-
-    #[test]
-    fn test_id_serde_human_readable() {
-        let original_id = Id([
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ]);
-        let expected_tokens = Tokens(vec![Token::Str(String::from(
-            "ad50847381e248feaac9876cc71ae418",
-        ))]);
-
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let serialized_tokens = original_id.serialize(&serializer).unwrap();
-        assert_eq!(serialized_tokens, expected_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(Tokens(vec![Token::Str(String::from(
-                "ad50847381e248feaac9876cc71ae418",
-            ))]))
-            .build();
-        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
-    }
-
-    macro_rules! roundtrip_type {
-        ($type:ty, $expected_bytes:expr) => {{
-            let expected_bytes: [u8; 16] = $expected_bytes;
-            let original_id = <$type>::from(expected_bytes);
-
-            let ser_bytes = original_id.ser().unwrap();
-            assert_eq!(ser_bytes, expected_bytes);
-
-            let des_id = <$type>::des(&ser_bytes).unwrap();
-            assert_eq!(des_id, original_id);
-        }};
-    }
-
-    #[test]
-    fn test_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(Id, expected_bytes);
-    }
-
-    #[test]
-    fn test_tenant_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(TenantId, expected_bytes);
-    }
-
-    #[test]
-    fn test_timeline_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(TimelineId, expected_bytes);
-    }
-}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -24,10 +24,6 @@ pub mod auth;

 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
-
-mod hex;
-pub use hex::Hex;
-
 // http endpoint utils
 pub mod http;

--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,7 +1,7 @@
 #![warn(missing_docs)]

 use camino::Utf8Path;
-use serde::{de::Visitor, Deserialize, Serialize};
+use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
 use std::str::FromStr;
@@ -13,114 +13,10 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
+#[serde(transparent)]
 pub struct Lsn(pub u64);

-impl Serialize for Lsn {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            self.0.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Lsn {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct LsnVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> Visitor<'de> for LsnVisitor {
-            type Value = Lsn;
-
-            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str(
-                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
-                    )
-                } else {
-                    formatter.write_str("value in form of integer(u64)")
-                }
-            }
-
-            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Ok(Lsn(v))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Lsn::from_str(v).map_err(|e| E::custom(e))
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(LsnVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_u64(LsnVisitor {
-                is_human_readable_deserializer: false,
-            })
-        }
-    }
-}
-
-/// Allows (de)serialization of an `Lsn` always as `u64`.
-///
-/// ### Example
-///
-/// ```rust
-/// # use serde::{Serialize, Deserialize};
-/// use utils::lsn::Lsn;
-///
-/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
-/// struct Foo {
-///   #[serde(with = "utils::lsn::serde_as_u64")]
-///   always_u64: Lsn,
-/// }
-///
-/// let orig = Foo { always_u64: Lsn(1234) };
-///
-/// let res = serde_json::to_string(&orig).unwrap();
-/// assert_eq!(res, r#"{"always_u64":1234}"#);
-///
-/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
-/// assert_eq!(foo, orig);
-/// ```
-///
-pub mod serde_as_u64 {
-    use super::Lsn;
-
-    /// Serializes the Lsn as u64 disregarding the human readability of the format.
-    ///
-    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
-    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
-        use serde::Serialize;
-        lsn.0.serialize(serializer)
-    }
-
-    /// Deserializes the Lsn as u64 disregarding the human readability of the format.
-    ///
-    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
-    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
-        use serde::Deserialize;
-        u64::deserialize(deserializer).map(Lsn)
-    }
-}
-
 /// We tried to parse an LSN from a string, but failed
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("LsnParseError")]
@@ -368,13 +264,8 @@ impl MonotonicCounter<Lsn> for RecordLsn {

 #[cfg(test)]
 mod tests {
-    use crate::bin_ser::BeSer;
-
    use super::*;

-    use serde::ser::Serialize;
-    use serde_assert::{Deserializer, Serializer, Token, Tokens};
-
    #[test]
    fn test_lsn_strings() {
        assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -450,95 +341,4 @@ mod tests {
        assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
        assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
    }
-
-    #[test]
-    fn test_lsn_serde() {
-        let original_lsn = Lsn(0x0123456789abcdef);
-        let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
-        let expected_non_readable_tokens =
-            Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
-
-        // Testing human_readable ser/de
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-        assert_eq!(readable_ser_tokens, expected_readable_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(readable_ser_tokens)
-            .build();
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-
-        // Testing NON human_readable ser/de
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-        assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(non_readable_ser_tokens)
-            .build();
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-
-        // Testing mismatching ser/de
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(non_readable_ser_tokens)
-            .build();
-        Lsn::deserialize(&mut deserializer).unwrap_err();
-
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(readable_ser_tokens)
-            .build();
-        Lsn::deserialize(&mut deserializer).unwrap_err();
-    }
-
-    #[test]
-    fn test_lsn_ensure_roundtrip() {
-        let original_lsn = Lsn(0xaaaabbbb);
-
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(ser_tokens)
-            .build();
-
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-    }
-
-    #[test]
-    fn test_lsn_bincode_serde() {
-        let lsn = Lsn(0x0123456789abcdef);
-        let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
-
-        let ser_bytes = lsn.ser().unwrap();
-        assert_eq!(ser_bytes, expected_bytes);
-
-        let des_lsn = Lsn::des(&ser_bytes).unwrap();
-        assert_eq!(des_lsn, lsn);
-    }
-
-    #[test]
-    fn test_lsn_bincode_ensure_roundtrip() {
-        let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
-        let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
-
-        let ser_bytes = original_lsn.ser().unwrap();
-        assert_eq!(ser_bytes, expected_bytes);
-
-        let des_lsn = Lsn::des(&ser_bytes).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-    }
 }
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -3,6 +3,7 @@ use std::time::{Duration, SystemTime};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use pq_proto::{read_cstr, PG_EPOCH};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use tracing::{trace, warn};

 use crate::lsn::Lsn;
@@ -14,17 +15,21 @@ use crate::lsn::Lsn;
 ///
 /// serde Serialize is used only for human readable dump to json (e.g. in
 /// safekeepers debug_dump).
+#[serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct PageserverFeedback {
    /// Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
    /// LSN last received and ingested by the pageserver. Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
    pub last_received_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver to its local disc.
    /// Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
    /// consider WAL before it can be removed.
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,7 +1,4 @@
-use std::sync::{
-    atomic::{AtomicUsize, Ordering},
-    Arc, Mutex, MutexGuard,
-};
+use std::sync::{Arc, Mutex, MutexGuard};
 use tokio::sync::Semaphore;

 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
@@ -13,7 +10,6 @@ use tokio::sync::Semaphore;
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
    inner: Mutex<Inner<T>>,
-    initializers: AtomicUsize,
 }

 impl<T> Default for OnceCell<T> {
@@ -21,7 +17,6 @@ impl<T> Default for OnceCell<T> {
    fn default() -> Self {
        Self {
            inner: Default::default(),
-            initializers: AtomicUsize::new(0),
        }
    }
 }
@@ -54,7 +49,6 @@ impl<T> OnceCell<T> {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
-            initializers: AtomicUsize::new(0),
        }
    }

@@ -66,8 +60,8 @@ impl<T> OnceCell<T> {
    /// Initialization is panic-safe and cancellation-safe.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
-        F: FnOnce(InitPermit) -> Fut,
-        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
+        F: FnOnce() -> Fut,
+        Fut: std::future::Future<Output = Result<T, E>>,
    {
        let sem = {
            let guard = self.inner.lock().unwrap();
@@ -77,61 +71,29 @@ impl<T> OnceCell<T> {
            guard.init_semaphore.clone()
        };

-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
+        let permit = sem.acquire_owned().await;
+        if permit.is_err() {
+            let guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_some(),
+                "semaphore got closed, must be initialized"
+            );
+            return Ok(Guard(guard));
+        } else {
+            // now we try
+            let value = factory().await?;

-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.lock().unwrap();
-
-                Ok(Self::set0(value, guard))
-            }
-            Err(_closed) => {
-                let guard = self.inner.lock().unwrap();
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(Guard(guard));
-            }
+            let mut guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_none(),
+                "we won permit, must not be initialized"
+            );
+            guard.value = Some(value);
+            guard.init_semaphore.close();
+            Ok(Guard(guard))
        }
    }

-    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
-    /// to complete initializing the inner value.
-    ///
-    /// # Panics
-    ///
-    /// If the inner has already been initialized.
-    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
-        let guard = self.inner.lock().unwrap();
-
-        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
-        // give more permits right now.
-        if guard.init_semaphore.try_acquire().is_ok() {
-            drop(guard);
-            panic!("permit is of wrong origin");
-        }
-
-        Self::set0(value, guard)
-    }
-
-    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
-        if guard.value.is_some() {
-            drop(guard);
-            unreachable!("we won permit, must not be initialized");
-        }
-        guard.value = Some(value);
-        guard.init_semaphore.close();
-        Guard(guard)
-    }
-
    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();
@@ -141,28 +103,6 @@ impl<T> OnceCell<T> {
            None
        }
    }
-
-    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
-    pub fn initializer_count(&self) -> usize {
-        self.initializers.load(Ordering::Relaxed)
-    }
-}
-
-/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
-/// initializing task for example at the end of initialization.
-struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
-
-impl<'a, T> CountWaitingInitializers<'a, T> {
-    fn start(target: &'a OnceCell<T>) -> Self {
-        target.initializers.fetch_add(1, Ordering::Relaxed);
-        CountWaitingInitializers(target)
-    }
-}
-
-impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
-    fn drop(&mut self) {
-        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
-    }
 }

 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
@@ -195,7 +135,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
        let mut swapped = Inner::default();
        let permit = swapped
            .init_semaphore
@@ -205,14 +145,11 @@ impl<'a, T> Guard<'a, T> {
        std::mem::swap(&mut *self.0, &mut swapped);
        swapped
            .value
-            .map(|v| (v, InitPermit(permit)))
+            .map(|v| (v, permit))
            .expect("guard is not created unless value has been initialized")
    }
 }

-/// Type held by OnceCell (de)initializing task.
-pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -248,11 +185,11 @@ mod tests {
                    barrier.wait().await;
                    let won = {
                        let g = cell
-                            .get_or_init(|permit| {
+                            .get_or_init(|| {
                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                async {
                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
-                                    Ok::<_, Infallible>((i, permit))
+                                    Ok::<_, Infallible>(i)
                                }
                            })
                            .await
@@ -306,7 +243,7 @@ mod tests {
        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
            .await
            .unwrap();

@@ -321,32 +258,18 @@ mod tests {
        assert_eq!(*cell.get().unwrap(), reinit);
    }

-    #[test]
-    fn reinit_with_deinit_permit() {
-        let cell = Arc::new(OnceCell::new(42));
-
-        let (mol, permit) = cell.get().unwrap().take_and_deinit();
-        cell.set(5, permit);
-        assert_eq!(*cell.get().unwrap(), 5);
-
-        let (five, permit) = cell.get().unwrap().take_and_deinit();
-        assert_eq!(5, five);
-        cell.set(mol, permit);
-        assert_eq!(*cell.get().unwrap(), 42);
-    }
-
    #[tokio::test]
    async fn initialization_attemptable_until_ok() {
        let cell = OnceCell::default();

        for _ in 0..10 {
-            cell.get_or_init(|_permit| async { Err("whatever error") })
+            cell.get_or_init(|| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");
@@ -358,11 +281,11 @@ mod tests {

        let barrier = tokio::sync::Barrier::new(2);

-        let initializer = cell.get_or_init(|permit| async {
+        let initializer = cell.get_or_init(|| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

-            Ok::<_, Infallible>(("never reached", permit))
+            Ok::<_, Infallible>("never reached")
        });

        tokio::select! {
@@ -375,7 +298,7 @@ mod tests {
        assert!(cell.get().is_none());

        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -21,6 +21,11 @@ pub struct FileCacheState {

 #[derive(Debug)]
 pub struct FileCacheConfig {
+    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
+    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
+    /// memory available for the cgroup.
+    pub(crate) in_memory: bool,
+
    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///
@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl Default for FileCacheConfig {
-    fn default() -> Self {
+impl FileCacheConfig {
+    pub fn default_in_memory() -> Self {
        Self {
+            in_memory: true,
+            // 75 %
+            resource_multiplier: 0.75,
+            // 640 MiB; (512 + 128)
+            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
+            // ensure any increase in file cache size is split 90-10 with 10% to other memory
+            spread_factor: 0.1,
+        }
+    }
+
+    pub fn default_on_disk() -> Self {
+        Self {
+            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
            spread_factor: 0.1,
        }
    }
-}

-impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -39,6 +39,16 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

+    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
+    /// kernel's page cache), and therefore should not count against available memory.
+    //
+    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
+    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
+    // during the switch away from an in-memory file cache, we had to default to the previous
+    // behavior.
+    #[arg(long)]
+    pub file_cache_on_disk: bool,
+
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -156,7 +156,10 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = FileCacheConfig::default();
+            let config = match args.file_cache_on_disk {
+                true => FileCacheConfig::default_on_disk(),
+                false => FileCacheConfig::default_in_memory(),
+            };

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -184,7 +187,10 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

-            file_cache_disk_size = actual_size;
+            if args.file_cache_on_disk {
+                file_cache_disk_size = actual_size;
+            }
+
            state.filecache = Some(file_cache);
        }

@@ -233,11 +239,17 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let expected_file_cache_size = self
+        let (expected_file_cache_size, expected_file_cache_disk_size) = self
            .filecache
            .as_ref()
-            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
-            .unwrap_or(0);
+            .map(|file_cache| {
+                let size = file_cache.config.calculate_cache_size(usable_system_memory);
+                match file_cache.config.in_memory {
+                    true => (size, 0),
+                    false => (size, size),
+                }
+            })
+            .unwrap_or((0, 0));
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

@@ -261,7 +273,7 @@ impl Runner {

            let new_threshold = self
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_size);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);

            let current = last_history.avg_non_reclaimable;

@@ -288,10 +300,13 @@ impl Runner {
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            file_cache_disk_size = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
+            }
            let message = format!(
-                "set file cache size to {} MiB",
+                "set file cache size to {} MiB (in memory = {})",
                bytes_to_mebibytes(actual_usage),
+                file_cache.config.in_memory,
            );
            info!("downscale: {message}");
            status.push(message);
@@ -342,7 +357,9 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            file_cache_disk_size = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
+            }

            if actual_usage != expected_usage {
                warn!(
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -88,6 +88,10 @@ criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

+[[bench]]
+name = "bench_writes"
+harness = false
+
 [[bench]]
 name = "bench_layer_map"
 harness = false
--- a/pageserver/benches/README.md
+++ b/pageserver/benches/README.md
@@ -10,3 +10,7 @@ To run a specific file:

 To run a specific function:
 `cargo bench --bench bench_layer_map -- real_map_uniform_queries`
+
+To add a new benchmark:
+1. Create a new file in `pageserver/benches/` that invokes `criterion_main!`
+2. Register it in `pageserver/Cargo.toml` as a `[[bench]]` entry with `harness = false`
--- a/pageserver/benches/bench_writes.rs
+++ b/pageserver/benches/bench_writes.rs
@@ -0,0 +1,76 @@
+use bytes::{Bytes, BytesMut};
+use camino::{Utf8Path, Utf8PathBuf};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use pageserver::{tenant::storage_layer::InMemoryLayer, config::PageServerConf, context::{RequestContext, DownloadBehavior}, task_mgr::TaskKind, repository::Key, virtual_file};
+use pageserver::repository::Value;
+use utils::{id::{TimelineId, TenantId}, lsn::Lsn};
+
+fn bench_writes(c: &mut Criterion) {
+    // Builds an `InMemoryLayer` holding 100 64-byte images, flushes it once via
+    // `write_to_disk_bench`, then registers the (still empty) `f1`/`f2` benchmarks.
+    // The repo dir lives under the OS temp dir so the bench runs on any machine.
+    // TODO this setup can be avoided if I reuse TenantHarness but it's difficult
+    //      because it's only compiled for tests. A function that just writes bytes
+    //      from memory to disk shouldn't need this many inputs: performance-critical
+    //      code should be self-contained, with the pageserver machinery outside.
+    virtual_file::init(10);
+    let repo_dir = Utf8PathBuf::from_path_buf(std::env::temp_dir().join("pageserver_bench_writes")).expect("temp dir path is not valid UTF-8");
+    let conf = PageServerConf::dummy_conf(repo_dir);
+    let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+    let timeline_id = TimelineId::generate();
+    let tenant_id = TenantId::generate();
+    let start_lsn = Lsn(0);
+    let ctx = RequestContext::new(TaskKind::LayerFlushTask, DownloadBehavior::Error);
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    fn test_img(s: &str) -> Bytes {
+        let mut buf = BytesMut::new();
+        buf.extend_from_slice(s.as_bytes());
+        buf.resize(64, 0);
+
+        buf.freeze()
+    }
+
+    // Make the InMemoryLayer that will be flushed
+    let layer = rt.block_on(async {
+        let l = InMemoryLayer::create(&conf, timeline_id, tenant_id, start_lsn).await.unwrap();
+
+        let mut lsn = Lsn(0x10);
+        let mut key = Key::from_hex("012222222233333333444444445500000000").unwrap();
+        let mut blknum = 0;
+        for _ in 0..100 {
+            key.field6 = blknum;
+            let val = Value::Image(test_img(&format!("{} at {}", blknum, lsn)));
+            l.put_value(key, lsn, &val, &ctx).await.unwrap();
+
+            lsn = Lsn(lsn.0 + 0x10);
+            blknum += 1;
+        }
+        l
+    });
+
+    rt.block_on(async {
+        layer.write_to_disk_bench(&ctx).await.unwrap();
+    });
+
+    // NOTE: the flush above runs once during setup and is not measured yet.
+    let mut group = c.benchmark_group("g1");
+    group.bench_function("f1", |b| {
+        b.iter(|| {
+            // TODO
+        });
+    });
+    group.bench_function("f2", |b| {
+        b.iter(|| {
+            // TODO
+        });
+    });
+    group.finish();
+}
+
+
+criterion_group!(group_1, bench_writes);
+criterion_main!(group_1);
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -3,6 +3,7 @@ use anyhow::Context;
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
+use serde_with::serde_as;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -41,10 +42,13 @@ pub(super) enum Name {
 ///
 /// This is a denormalization done at the MetricsKey const methods; these should not be constructed
 /// elsewhere.
+#[serde_with::serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub(crate) struct MetricsKey {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,

+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,

--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,4 +1,5 @@
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use serde_with::serde_as;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -6,9 +7,12 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
 use utils::id::{TenantId, TimelineId};

 /// How the metrics from pageserver are identified.
+#[serde_with::serde_as]
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
 struct Ids {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
-use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -18,6 +17,7 @@ use hex::FromHex;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
+use serde_with::serde_as;
 use thiserror::Error;
 use tokio;
 use tokio_util::sync::CancellationToken;
@@ -214,6 +214,7 @@ where
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";

+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionList {
    /// Serialization version, for future use
@@ -242,6 +243,7 @@ struct DeletionList {
    validated: bool,
 }

+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionHeader {
    /// Serialization version, for future use
@@ -269,9 +271,7 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
-            .maybe_fatal_err("save deletion header")?;
-
-        Ok(())
+            .map_err(Into::into)
    }
 }

@@ -360,7 +360,6 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
-            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
 }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::virtual_file::on_fatal_io_error;
-use crate::virtual_file::MaybeFatalIo;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    on_fatal_io_error(&e, "reading deletion header");
+                    Err(anyhow::anyhow!(e))
                }
            }
        }
@@ -218,9 +216,16 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = tokio::fs::read_dir(&deletion_directory)
-            .await
-            .fatal_err("read deletion directory");
+        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+            Ok(d) => d,
+            Err(e) => {
+                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
+
+                // Give up: if we can't read the deletion list directory, we probably can't
+                // write lists into it later, so the queue won't work.
+                return Err(e.into());
+            }
+        };

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
+        while let Some(dentry) = dir.next_entry().await? {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -241,9 +246,11 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
-                tokio::fs::remove_file(&absolute_path)
-                    .await
-                    .fatal_err("delete temp file");
+                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+                    // Non-fatal error: we will just leave the file behind but not
+                    // try and load it.
+                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
+                }

                continue;
            }
@@ -283,9 +290,7 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path)
-                .await
-                .fatal_err("read deletion list");
+            let list_bytes = tokio::fs::read(&list_path).await?;

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
-use crate::virtual_file::MaybeFatalIo;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
-            tokio::fs::remove_file(&list_path)
-                .await
-                .fatal_err("remove deletion list");
+
+            if let Err(e) = tokio::fs::remove_file(&list_path).await {
+                // Unexpected: we should have permissions and nothing else should
+                // be touching these files.  We will leave the file behind.  Subsequent
+                // pageservers will try and load it again: hopefully whatever storage
+                // issue (probably permissions) has been fixed by then.
+                tracing::error!("Failed to delete {list_path}: {e:#}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                break;
+            }
        }
    }

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,6 +17,7 @@ use pageserver_api::models::{
    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
+use serde_with::{serde_as, DisplayFromStr};
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -498,8 +499,10 @@ async fn get_lsn_by_timestamp_handler(
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

    if version.unwrap_or(0) > 1 {
+        #[serde_as]
        #[derive(serde::Serialize)]
        struct Result {
+            #[serde_as(as = "DisplayFromStr")]
            lsn: Lsn,
            kind: &'static str,
        }
@@ -808,8 +811,10 @@ async fn tenant_size_handler(
    }

    /// The type resides in the pageserver not to expose `ModelInputs`.
+    #[serde_with::serde_as]
    #[derive(serde::Serialize)]
    struct TenantHistorySize {
+        #[serde_as(as = "serde_with::DisplayFromStr")]
        id: TenantId,
        /// Size is a mixture of WAL and logical size, so the unit is bytes.
        ///
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -406,123 +406,4 @@ mod tests {
            METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
        );
    }
-
-    #[test]
-    fn test_metadata_bincode_serde() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let metadata_bytes = original_metadata
-            .to_bytes()
-            .expect("Cannot create bytes array from metadata");
-
-        let metadata_bincode_be_bytes = original_metadata
-            .ser()
-            .expect("Cannot serialize the metadata");
-
-        // 8 bytes for the length of the vector
-        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
-
-        let expected_bincode_bytes = {
-            let mut temp = vec![];
-            let len_bytes = metadata_bytes.len().to_be_bytes();
-            temp.extend_from_slice(&len_bytes);
-            temp.extend_from_slice(&metadata_bytes);
-            temp
-        };
-        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
-
-        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
-        // Deserialized metadata has the metadata header, which is different from the serialized one.
-        //   Reference: TimelineMetaData::to_bytes()
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        assert_eq!(deserialized_metadata, expected_metadata);
-    }
-
-    #[test]
-    fn test_metadata_bincode_serde_ensure_roundtrip() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let expected_bytes = vec![
-            /* bincode length encoding bytes */
-            0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
-            /* TimelineMetadataHeader */
-            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
-            /* TimelineMetadataBodyV2 */
-            0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
-            1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
-            1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119,
-            136, // ancestor_timeline (17 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 15, // pg_version (4 bytes)
-            /* padding bytes */
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0,
-        ];
-        let metadata_ser_bytes = original_metadata.ser().unwrap();
-        assert_eq!(metadata_ser_bytes, expected_bytes);
-
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
-        assert_eq!(des_metadata, expected_metadata);
-    }
 }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1542,7 +1542,7 @@ pub fn remote_index_path(
 }

 /// Given the key of an index, parse out the generation part of the name
-pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
+pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
        Some(f) => f,
        None => {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -6,6 +6,7 @@ use std::collections::HashMap;

 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::bin_ser::SerializeError;

 use crate::tenant::metadata::TimelineMetadata;
@@ -57,6 +58,7 @@ impl LayerFileMetadata {
 ///
 /// This type needs to be backwards and forwards compatible. When changing the fields,
 /// remember to add a test case for the changed version.
+#[serde_as]
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct IndexPart {
    /// Debugging aid describing the version of this type.
@@ -76,6 +78,7 @@ pub struct IndexPart {
    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
    // private because internally we would read from metadata instead.
+    #[serde_as(as = "DisplayFromStr")]
    disk_consistent_lsn: Lsn,

    #[serde(rename = "metadata_bytes")]
@@ -152,7 +155,7 @@ pub struct IndexLayerMetadata {

    #[serde(default = "Generation::none")]
    #[serde(skip_serializing_if = "Generation::is_none")]
-    pub generation: Generation,
+    pub(super) generation: Generation,
 }

 impl From<LayerFileMetadata> for IndexLayerMetadata {
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -29,6 +29,7 @@ use tenant_size_model::{Segment, StorageModel};
 /// needs. We will convert this into a StorageModel when it's time to perform
 /// the calculation.
 ///
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct ModelInputs {
    pub segments: Vec<SegmentMeta>,
@@ -36,9 +37,11 @@ pub struct ModelInputs {
 }

 /// A [`Segment`], with some extra information for display purposes
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct SegmentMeta {
    pub segment: Segment,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub kind: LsnKind,
 }
@@ -74,22 +77,32 @@ pub enum LsnKind {

 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
 /// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub timeline_id: TimelineId,

+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    pub ancestor_id: Option<TimelineId>,

+    #[serde_as(as = "serde_with::DisplayFromStr")]
    ancestor_lsn: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    last_record: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    latest_gc_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    horizon_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pitr_cutoff: Lsn,

    /// Cutoff point based on GC settings
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    next_gc_cutoff: Lsn,

    /// Cutoff point calculated from the user-supplied 'max_retention_period'
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    retention_param_cutoff: Option<Lsn>,
 }

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,6 +4,7 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod inmemory_layer_raw;
 mod layer;
 mod layer_desc;

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -367,4 +367,61 @@ impl InMemoryLayer {
        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
        Ok(delta_layer)
    }
+
+    /// Benchmark helper: copy every value of this frozen in-memory layer into a new
+    /// `DeltaLayerWriter`. The writer is not finished yet (see the TODO at the end), so
+    /// no delta layer is produced or returned. Panics if `end_lsn` has not been set.
+    pub async fn write_to_disk_bench(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        // Grab the lock in read-mode. We hold it over the I/O, but because this
+        // layer is not writeable anymore, no one should be trying to acquire the
+        // write lock on it, so we shouldn't block anyone. There's one exception
+        // though: another thread might have grabbed a reference to this layer
+        // in `get_layer_for_write' just before the checkpointer called
+        // `freeze`, and then `write_to_disk` on it. When the thread gets the
+        // lock, it will see that it's not writeable anymore and retry, but it
+        // would have to wait until we release it. That race condition is very
+        // rare though, so we just accept the potential latency hit for now.
+        let inner = self.inner.read().await;
+
+        let end_lsn = *self.end_lsn.get().unwrap();
+
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_id,
+            Key::MIN,
+            self.start_lsn..end_lsn,
+        )
+        .await?;
+
+        let mut buf = Vec::new();
+
+        let cursor = inner.file.block_cursor();
+
+        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        keys.sort_by_key(|k| k.0);
+
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+        for (key, vec_map) in keys.iter() {
+            let key = **key;
+            // Write all page versions
+            for (lsn, pos) in vec_map.as_slice() {
+                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                let will_init = Value::des(&buf)?.will_init();
+                delta_layer_writer
+                    .put_value_bytes(key, *lsn, &buf, will_init)
+                    .await?;
+            }
+        }
+
+        // MAX is used here because we identify L0 layers by full key range
+        // TODO XXX do this
+        // let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer_raw.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer_raw.rs
@@ -0,0 +1,23 @@
+/// Work-in-progress stub: a field-less placeholder type whose methods do nothing yet.
+/// NOTE(review): `Key`, `Lsn`, `Value`, `RequestContext`, `Result` have no `use` in this file.
+pub struct InMemoryLayerRaw {
+}
+
+impl InMemoryLayerRaw {
+    pub async fn new() -> Self {
+        Self {
+
+        }
+    }
+
+    pub async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        // TODO: stub, ignores all of its arguments and always returns Ok(())
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -337,39 +337,31 @@ enum ResidentOrWantedEvicted {
 }

 impl ResidentOrWantedEvicted {
-    fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
+    fn get(&self) -> Option<Arc<DownloadedLayer>> {
        match self {
-            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
+            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
            ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
                Some(strong) => {
                    LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
-
-                    *self = ResidentOrWantedEvicted::Resident(strong.clone());
-
-                    Some((strong, true))
+                    Some(strong)
                }
                None => None,
            },
        }
    }
-
    /// When eviction is first requested, drop down to holding a [`Weak`].
    ///
-    /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
-    /// drop the possibly last strong reference outside of the mutex of
-    /// heavier_once_cell::OnceCell.
-    fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
+    /// Returns `true` if this was the first time eviction was requested.
+    fn downgrade(&mut self) -> bool {
        match self {
            ResidentOrWantedEvicted::Resident(strong) => {
                let weak = Arc::downgrade(strong);
-                let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
-                std::mem::swap(self, &mut temp);
-                match temp {
-                    ResidentOrWantedEvicted::Resident(strong) => Some(strong),
-                    ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
-                }
+                *self = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
+                // returning the weak is not useful, because the drop could have already run with
+                // the replacement above, and that will take care of cleaning the Option we are in
+                true
            }
-            ResidentOrWantedEvicted::WantedEvicted(..) => None,
+            ResidentOrWantedEvicted::WantedEvicted(..) => false,
        }
    }
 }
@@ -411,10 +403,6 @@ struct LayerInner {
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
-    ///
-    /// If in future we need to implement "wait until layer instances are gone and done", carrying
-    /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
-    /// method for "wait_gc" which will wait to this being closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -565,8 +553,6 @@ impl LayerInner {
        }
    }

-    /// Cancellation safe, however dropping the future and calling this method again might result
-    /// in a new attempt to evict OR join the previously started attempt.
    pub(crate) async fn evict_and_wait(
        &self,
        _: &RemoteTimelineClient,
@@ -577,22 +563,20 @@ impl LayerInner {

        let mut rx = self.status.subscribe();

-        let strong = {
-            match self.inner.get() {
-                Some(mut either) => {
-                    self.wanted_evicted.store(true, Ordering::Relaxed);
-                    either.downgrade()
-                }
-                None => return Err(EvictionError::NotFound),
-            }
-        };
+        let res =
+            self.wanted_evicted
+                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

-        if strong.is_some() {
-            // drop the DownloadedLayer outside of the holding the guard
-            drop(strong);
+        if res.is_ok() {
            LAYER_IMPL_METRICS.inc_started_evictions();
        }

+        if self.get().is_none() {
+            // it was not evictable in the first place
+            // our store to the wanted_evicted does not matter; it will be reset by next download
+            return Err(EvictionError::NotFound);
+        }
+
        match rx.recv().await {
            Ok(Status::Evicted) => Ok(()),
            Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
@@ -606,8 +590,7 @@ impl LayerInner {
                //
                // use however late (compared to the initial expressing of wanted) as the
                // "outcome" now
-                LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get() {
+                match self.get() {
                    Some(_) => Err(EvictionError::Downloaded),
                    None => Ok(()),
                }
@@ -615,17 +598,15 @@ impl LayerInner {
        }
    }

-    /// Cancellation safe.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
+    /// Should be cancellation safe, but cancellation is troublesome together with the spawned
+    /// download.
    async fn get_or_maybe_download(
        self: &Arc<Self>,
        allow_download: bool,
        ctx: Option<&RequestContext>,
    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
-        let mut init_permit = None;
-
        loop {
-            let download = move |permit| async move {
+            let download = move || async move {
                // disable any scheduled but not yet running eviction deletions for this
                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

@@ -646,11 +627,7 @@ impl LayerInner {
                    .await
                    .map_err(DownloadError::PreStatFailed)?;

-                let permit = if let Some(reason) = needs_download {
-                    if let NeedsDownload::NotFile(ft) = reason {
-                        return Err(DownloadError::NotFile(ft));
-                    }
-
+                if let Some(reason) = needs_download {
                    // only reset this after we've decided we really need to download. otherwise it'd
                    // be impossible to mark cancelled downloads for eviction, like one could imagine
                    // we would like to do for prefetching which was not needed.
@@ -660,6 +637,8 @@ impl LayerInner {
                        return Err(DownloadError::NoRemoteStorage);
                    }

+                    tracing::debug!(%reason, "downloading layer");
+
                    if let Some(ctx) = ctx {
                        self.check_expected_download(ctx)?;
                    }
@@ -670,16 +649,12 @@ impl LayerInner {
                        return Err(DownloadError::DownloadRequired);
                    }

-                    tracing::info!(%reason, "downloading on-demand");
-
-                    self.spawn_download_and_wait(timeline, permit).await?
+                    self.spawn_download_and_wait(timeline).await?;
                } else {
                    // the file is present locally, probably by a previous but cancelled call to
                    // get_or_maybe_download. alternatively we might be running without remote storage.
                    LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                    permit
-                };
+                }

                let res = Arc::new(DownloadedLayer {
                    owner: Arc::downgrade(self),
@@ -692,60 +667,19 @@ impl LayerInner {
                    LayerResidenceEventReason::ResidenceChange,
                );

-                let waiters = self.inner.initializer_count();
-                if waiters > 0 {
-                    tracing::info!(waiters, "completing the on-demand download for other tasks");
-                }
-
-                Ok((ResidentOrWantedEvicted::Resident(res), permit))
+                Ok(ResidentOrWantedEvicted::Resident(res))
            };

-            if let Some(init_permit) = init_permit.take() {
-                // use the already held initialization permit because it is impossible to hit the
-                // below paths anymore essentially limiting the max loop iterations to 2.
-                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit);
-                let (strong, _upgraded) = guard
-                    .get_and_upgrade()
-                    .expect("init creates strong reference, we held the init permit");
+            let locked = self.inner.get_or_init(download).await?;
+
+            if let Some(strong) = Self::get_or_apply_evictedness(Some(locked), &self.wanted_evicted)
+            {
                return Ok(strong);
            }

-            let (weak, permit) = {
-                let mut locked = self.inner.get_or_init(download).await?;
-
-                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
-                    if upgraded {
-                        // when upgraded back, the Arc<DownloadedLayer> is still available, but
-                        // previously a `evict_and_wait` was received.
-                        self.wanted_evicted.store(false, Ordering::Relaxed);
-
-                        // error out any `evict_and_wait`
-                        drop(self.status.send(Status::Downloaded));
-                        LAYER_IMPL_METRICS
-                            .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
-                    }
-
-                    return Ok(strong);
-                } else {
-                    // path to here: the evict_blocking is stuck on spawn_blocking queue.
-                    //
-                    // reset the contents, deactivating the eviction and causing a
-                    // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
-                    locked.take_and_deinit()
-                }
-            };
-
-            // unlock first, then drop the weak, but because upgrade failed, we
-            // know it cannot be a problem.
-
-            assert!(
-                matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
-                "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
-            );
-
-            init_permit = Some(permit);
-
+            // the situation in which we might need to retry is that our init was ready
+            // immediately, but the DownloadedLayer had been dropped BUT failed to complete
+            // Self::evict_blocking
            LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
        }
    }
@@ -757,8 +691,8 @@ impl LayerInner {
        match b {
            Download => Ok(()),
            Warn | Error => {
-                tracing::info!(
-                    "unexpectedly on-demand downloading for task kind {:?}",
+                tracing::warn!(
+                    "unexpectedly on-demand downloading remote layer {self} for task kind {:?}",
                    ctx.task_kind()
                );
                crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
@@ -780,17 +714,14 @@ impl LayerInner {
    async fn spawn_download_and_wait(
        self: &Arc<Self>,
        timeline: Arc<Timeline>,
-        permit: heavier_once_cell::InitPermit,
-    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
+    ) -> Result<(), DownloadError> {
        let task_name = format!("download layer {}", self);

        let (tx, rx) = tokio::sync::oneshot::channel();
-
        // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
        // block tenant::mgr::remove_tenant_from_memory.

        let this: Arc<Self> = self.clone();
-
        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
@@ -799,7 +730,6 @@ impl LayerInner {
            &task_name,
            false,
            async move {
-
                let client = timeline
                    .remote_client
                    .as_ref()
@@ -821,9 +751,9 @@ impl LayerInner {
                    }
                };

-                if let Err(res) = tx.send((result, permit)) {
+                if let Err(res) = tx.send(result) {
                    match res {
-                        (Ok(()), _) => {
+                        Ok(()) => {
                            // our caller is cancellation safe so this is fine; if someone
                            // else requests the layer, they'll find it already downloaded
                            // or redownload.
@@ -834,7 +764,7 @@ impl LayerInner {
                            tracing::info!("layer file download completed after requester had cancelled");
                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
                        },
-                        (Err(e), _) => {
+                        Err(e) => {
                            // our caller is cancellation safe, but we might be racing with
                            // another attempt to initialize. before we have cancellation
                            // token support: these attempts should converge regardless of
@@ -850,7 +780,7 @@ impl LayerInner {
            .in_current_span(),
        );
        match rx.await {
-            Ok((Ok(()), permit)) => {
+            Ok(Ok(())) => {
                if let Some(reason) = self
                    .needs_download()
                    .await
@@ -861,12 +791,10 @@ impl LayerInner {
                }

                self.consecutive_failures.store(0, Ordering::Relaxed);
-                tracing::info!("on-demand download successful");

-                Ok(permit)
+                Ok(())
            }
-            Ok((Err(e), _permit)) => {
-                // FIXME: this should be with the spawned task and be cancellation sensitive
+            Ok(Err(e)) => {
                let consecutive_failures =
                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -884,6 +812,33 @@ impl LayerInner {
        }
    }

+    /// Access the current state without waiting for the file to be downloaded.
+    ///
+    /// Requires that we've been initialized to a state which corresponds to the
+    /// actual residency state.
+    fn get(&self) -> Option<Arc<DownloadedLayer>> {
+        let locked = self.inner.get();
+        Self::get_or_apply_evictedness(locked, &self.wanted_evicted)
+    }
+
+    fn get_or_apply_evictedness(
+        guard: Option<heavier_once_cell::Guard<'_, ResidentOrWantedEvicted>>,
+        wanted_evicted: &AtomicBool,
+    ) -> Option<Arc<DownloadedLayer>> {
+        if let Some(mut x) = guard {
+            if let Some(won) = x.get() {
+                // there are no guarantees that we will always get to observe a concurrent call
+                // to evict
+                if wanted_evicted.load(Ordering::Acquire) {
+                    x.downgrade();
+                }
+                return Some(won);
+            }
+        }
+
+        None
+    }
+
    async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
        match tokio::fs::metadata(&self.path).await {
            Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -903,7 +858,7 @@ impl LayerInner {
    fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
        // in future, this should include sha2-256 validation of the file.
        if !m.is_file() {
-            Err(NeedsDownload::NotFile(m.file_type()))
+            Err(NeedsDownload::NotFile)
        } else if m.len() != self.desc.file_size {
            Err(NeedsDownload::WrongSize {
                actual: m.len(),
@@ -917,9 +872,7 @@ impl LayerInner {
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.desc.filename().file_name();

-        // this is not accurate: we could have the file locally but there was a cancellation
-        // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get().is_none();
+        let remote = self.get().is_none();

        let access_stats = self.access_stats.as_api_model(reset);

@@ -1054,14 +1007,11 @@ impl LayerInner {
                Ok(())
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                tracing::error!(
-                    layer_size = %self.desc.file_size,
-                    "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
-                );
+                tracing::info!("failed to evict file from disk, it was already gone");
                Err(EvictionCancelled::FileNotFound)
            }
            Err(e) => {
-                tracing::error!("failed to evict file from disk: {e:#}");
+                tracing::warn!("failed to evict file from disk: {e:#}");
                Err(EvictionCancelled::RemoveFailed)
            }
        };
@@ -1105,8 +1055,6 @@ enum DownloadError {
    ContextAndConfigReallyDeniesDownloads,
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
-    #[error("layer path exists, but it is not a file: {0:?}")]
-    NotFile(std::fs::FileType),
    /// Why no error here? Because it will be reported by page_service. We should had also done
    /// retries already.
    #[error("downloading evicted layer file failed")]
@@ -1122,7 +1070,7 @@ enum DownloadError {
 #[derive(Debug, PartialEq)]
 pub(crate) enum NeedsDownload {
    NotFound,
-    NotFile(std::fs::FileType),
+    NotFile,
    WrongSize { actual: u64, expected: u64 },
 }

@@ -1130,7 +1078,7 @@ impl std::fmt::Display for NeedsDownload {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            NeedsDownload::NotFound => write!(f, "file was not found"),
-            NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
+            NeedsDownload::NotFile => write!(f, "path is not a file"),
            NeedsDownload::WrongSize { actual, expected } => {
                write!(f, "file size mismatch {actual} vs. {expected}")
            }
@@ -1141,8 +1089,6 @@ impl std::fmt::Display for NeedsDownload {
 /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
 pub(crate) struct DownloadedLayer {
    owner: Weak<LayerInner>,
-    // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
-    // DownloadedLayer
    kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    version: usize,
 }
@@ -1186,6 +1132,7 @@ impl DownloadedLayer {
                "these are the same, just avoiding the upgrade"
            );

+            // there is nothing async here, but it should be async
            let res = if owner.desc.is_delta {
                let summary = Some(delta_layer::Summary::expected(
                    owner.desc.tenant_id,
@@ -1284,8 +1231,6 @@ impl std::fmt::Debug for ResidentLayer {

 impl ResidentLayer {
    /// Release the eviction guard, converting back into a plain [`Layer`].
-    ///
-    /// You can access the [`Layer`] also by using `as_ref`.
    pub(crate) fn drop_eviction_guard(self) -> Layer {
        self.into()
    }
@@ -1341,7 +1286,7 @@ impl AsRef<Layer> for ResidentLayer {
    }
 }

-/// Drop the eviction guard.
+/// Allow slimming down if we don't want the `2*usize` with eviction candidates?
 impl From<ResidentLayer> for Layer {
    fn from(value: ResidentLayer) -> Self {
        value.owner
@@ -1511,13 +1456,6 @@ impl LayerImplMetrics {
            .unwrap()
            .inc();
    }
-
-    fn inc_broadcast_lagged(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["broadcast_lagged"])
-            .unwrap()
-            .inc();
-    }
 }

 enum EvictionCancelled {
@@ -1529,8 +1467,6 @@ enum EvictionCancelled {
    AlreadyReinitialized,
    /// Not evicted because of a pending reinitialization
    LostToDownload,
-    /// After eviction, there was a new layer access which cancelled the eviction.
-    UpgradedBackOnAccess,
 }

 impl EvictionCancelled {
@@ -1543,7 +1479,6 @@ impl EvictionCancelled {
            EvictionCancelled::RemoveFailed => "remove_failed",
            EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
            EvictionCancelled::LostToDownload => "lost_to_download",
-            EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2936,10 +2936,13 @@ struct CompactLevel0Phase1StatsBuilder {
    new_deltas_size: Option<u64>,
 }

+#[serde_as]
 #[derive(serde::Serialize)]
 struct CompactLevel0Phase1Stats {
    version: u64,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    tenant_id: TenantId,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,7 +19,6 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{RwLock, RwLockWriteGuard};
-use utils::fs_ext;

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -174,78 +173,37 @@ impl OpenFiles {
    }
 }

-/// Identify error types that should alwways terminate the process.  Other
-/// error types may be elegible for retry.
-pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
-    use nix::errno::Errno::*;
-    match e.raw_os_error().map(nix::errno::from_i32) {
-        Some(EIO) => {
-            // Terminate on EIO because we no longer trust the device to store
-            // data safely, or to uphold persistence guarantees on fsync.
-            true
-        }
-        Some(EROFS) => {
-            // Terminate on EROFS because a filesystem is usually remounted
-            // readonly when it has experienced some critical issue, so the same
-            // logic as EIO applies.
-            true
-        }
-        Some(EACCES) => {
-            // Terminate on EACCESS because we should always have permissions
-            // for our own data dir: if we don't, then we can't do our job and
-            // need administrative intervention to fix permissions.  Terminating
-            // is the best way to make sure we stop cleanly rather than going
-            // into infinite retry loops, and will make it clear to the outside
-            // world that we need help.
-            true
-        }
-        _ => {
-            // Treat all other local file I/O errors are retryable.  This includes:
-            // - ENOSPC: we stay up and wait for eviction to free some space
-            // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
-            // - WriteZero, Interrupted: these are used internally VirtualFile
-            false
-        }
-    }
+#[derive(Debug, thiserror::Error)]
+pub enum CrashsafeOverwriteError {
+    #[error("final path has no parent dir")]
+    FinalPathHasNoParentDir,
+    #[error("remove tempfile")]
+    RemovePreviousTempfile(#[source] std::io::Error),
+    #[error("create tempfile")]
+    CreateTempfile(#[source] std::io::Error),
+    #[error("write tempfile")]
+    WriteContents(#[source] std::io::Error),
+    #[error("sync tempfile")]
+    SyncTempfile(#[source] std::io::Error),
+    #[error("rename tempfile to final path")]
+    RenameTempfileToFinalPath(#[source] std::io::Error),
+    #[error("open final path parent dir")]
+    OpenFinalPathParentDir(#[source] std::io::Error),
+    #[error("sync final path parent dir")]
+    SyncFinalPathParentDir(#[source] std::io::Error),
 }
-
-/// Call this when the local filesystem gives us an error with an external
-/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either
-/// bad storage or bad configuration, and we can't fix that from inside
-/// a running process.
-pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
-    tracing::error!("Fatal I/O error: {e}: {context})");
-    std::process::abort();
-}
-
-pub(crate) trait MaybeFatalIo<T> {
-    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
-    fn fatal_err(self, context: &str) -> T;
-}
-
-impl<T> MaybeFatalIo<T> for std::io::Result<T> {
-    /// Terminate the process if the result is an error of a fatal type, else pass it through
-    ///
-    /// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but
-    /// not on ENOSPC.
-    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
-        if let Err(e) = &self {
-            if is_fatal_io_error(e) {
-                on_fatal_io_error(e, context);
-            }
-        }
-        self
-    }
-
-    /// Terminate the process on any I/O error.
-    ///
-    /// This is appropriate for reads on files that we know exist: they should always work.
-    fn fatal_err(self, context: &str) -> T {
+impl CrashsafeOverwriteError {
+    /// Returns true iff the new contents are durably stored.
+    pub fn are_new_contents_durable(&self) -> bool {
        match self {
-            Ok(v) => v,
-            Err(e) => {
-                on_fatal_io_error(&e, context);
-            }
+            Self::FinalPathHasNoParentDir => false,
+            Self::RemovePreviousTempfile(_) => false,
+            Self::CreateTempfile(_) => false,
+            Self::WriteContents(_) => false,
+            Self::SyncTempfile(_) => false,
+            Self::RenameTempfileToFinalPath(_) => false,
+            Self::OpenFinalPathParentDir(_) => false,
+            Self::SyncFinalPathParentDir(_) => true,
        }
    }
 }
@@ -326,13 +284,15 @@ impl VirtualFile {
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
        content: &[u8],
-    ) -> std::io::Result<()> {
+    ) -> Result<(), CrashsafeOverwriteError> {
        let Some(final_path_parent) = final_path.parent() else {
-            return Err(std::io::Error::from_raw_os_error(
-                nix::errno::Errno::EINVAL as i32,
-            ));
+            return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
        };
-        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
+        match std::fs::remove_file(tmp_path) {
+            Ok(()) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
+        }
        let mut file = Self::open_with_options(
            tmp_path,
            OpenOptions::new()
@@ -341,20 +301,31 @@ impl VirtualFile {
                // we bail out instead of causing damage.
                .create_new(true),
        )
-        .await?;
-        file.write_all(content).await?;
-        file.sync_all().await?;
+        .await
+        .map_err(CrashsafeOverwriteError::CreateTempfile)?;
+        file.write_all(content)
+            .await
+            .map_err(CrashsafeOverwriteError::WriteContents)?;
+        file.sync_all()
+            .await
+            .map_err(CrashsafeOverwriteError::SyncTempfile)?;
        drop(file); // before the rename, that's important!
                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)?;
+        std::fs::rename(tmp_path, final_path)
+            .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
        // Only open final path parent dirfd now, so that this operation only
        // ever holds one VirtualFile fd at a time.  That's important because
        // the current `find_victim_slot` impl might pick the same slot for both
        // VirtualFile., and it eventually does a blocking write lock instead of
        // try_lock.
        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
-        final_parent_dirfd.sync_all().await?;
+            Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
+                .await
+                .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
+        final_parent_dirfd
+            .sync_all()
+            .await
+            .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
        Ok(())
    }

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -88,7 +88,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
 static void WalSndLoop(WalProposer *wp);
 static void XLogBroadcastWalProposer(WalProposer *wp);

-static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
+static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
 static void XLogWalPropClose(XLogRecPtr recptr);

 static void
@@ -1241,7 +1241,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
 				rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;

 				/* write WAL to disk */
-				XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
+				XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);

 				ereport(DEBUG1,
 						(errmsg("Recover message %X/%X length %d",
@@ -1283,24 +1283,11 @@ static XLogSegNo walpropSegNo = 0;
 * Write XLOG data to disk.
 */
 static void
-XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
+XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
 {
 	int			startoff;
 	int			byteswritten;

-	/*
-	 * Apart from walproposer, basebackup LSN page is also written out by
-	 * postgres itself which writes WAL only in pages, and in basebackup it is
-	 * inherently dummy (only safekeepers have historic WAL). Update WAL buffers
-	 * here to avoid dummy page overwriting correct one we download here. Ugly,
-	 * but alternatives are about the same ugly. We won't need that if we switch
-	 * to on-demand WAL download from safekeepers, without writing to disk.
-	 *
-	 * https://github.com/neondatabase/neon/issues/5749
-	 */
-	if (!wp->config->syncSafekeepers)
-		XLogUpdateWalBuffers(buf, recptr, nbytes);
-
 	while (nbytes > 0)
 	{
 		int			segbytes;
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -13,7 +13,6 @@ pub struct ConsoleError {
 #[derive(Deserialize)]
 pub struct GetRoleSecret {
    pub role_secret: Box<str>,
-    pub allowed_ips: Option<Vec<Box<str>>>,
 }

 // Manually implement debug to omit sensitive info.
@@ -188,31 +187,4 @@ mod tests {

        Ok(())
    }
-
-    #[test]
-    fn parse_wake_compute() -> anyhow::Result<()> {
-        let json = json!({
-            "address": "0.0.0.0",
-            "aux": dummy_aux(),
-        });
-        let _: WakeCompute = serde_json::from_str(&json.to_string())?;
-        Ok(())
-    }
-
-    #[test]
-    fn parse_get_role_secret() -> anyhow::Result<()> {
-        // Empty `allowed_ips` field.
-        let json = json!({
-            "role_secret": "secret",
-        });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
-        // Empty `allowed_ips` field.
-        let json = json!({
-            "role_secret": "secret",
-            "allowed_ips": ["8.8.8.8"],
-        });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
-
-        Ok(())
-    }
 }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -470,26 +470,30 @@ async fn query_to_json<T: GenericClient>(
    }
    .and_then(|s| s.parse::<i64>().ok());

-    let mut fields = vec![];
-    let mut columns = vec![];
-
-    for c in row_stream.columns() {
-        fields.push(json!({
-            "name": Value::String(c.name().to_owned()),
-            "dataTypeID": Value::Number(c.type_().oid().into()),
-            "tableID": c.table_oid(),
-            "columnID": c.column_id(),
-            "dataTypeSize": c.type_size(),
-            "dataTypeModifier": c.type_modifier(),
-            "format": "text",
-        }));
-        columns.push(client.get_type(c.type_oid()).await?);
-    }
+    let fields = if !rows.is_empty() {
+        rows[0]
+            .columns()
+            .iter()
+            .map(|c| {
+                json!({
+                    "name": Value::String(c.name().to_owned()),
+                    "dataTypeID": Value::Number(c.type_().oid().into()),
+                    "tableID": c.table_oid(),
+                    "columnID": c.column_id(),
+                    "dataTypeSize": c.type_size(),
+                    "dataTypeModifier": c.type_modifier(),
+                    "format": "text",
+                })
+            })
+            .collect::<Vec<_>>()
+    } else {
+        Vec::new()
+    };

    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
+        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;

    // resulting JSON format is based on the format of node-postgres result
@@ -510,28 +514,22 @@ async fn query_to_json<T: GenericClient>(
 //
 pub fn pg_text_row_to_json(
    row: &Row,
-    columns: &[Type],
    raw_output: bool,
    array_mode: bool,
 ) -> Result<Value, anyhow::Error> {
-    let iter = row
-        .columns()
-        .iter()
-        .zip(columns)
-        .enumerate()
-        .map(|(i, (column, typ))| {
-            let name = column.name();
-            let pg_value = row.as_text(i)?;
-            let json_value = if raw_output {
-                match pg_value {
-                    Some(v) => Value::String(v.to_string()),
-                    None => Value::Null,
-                }
-            } else {
-                pg_text_to_json(pg_value, typ)?
-            };
-            Ok((name.to_string(), json_value))
-        });
+    let iter = row.columns().iter().enumerate().map(|(i, column)| {
+        let name = column.name();
+        let pg_value = row.as_text(i)?;
+        let json_value = if raw_output {
+            match pg_value {
+                Some(v) => Value::String(v.to_string()),
+                None => Value::Null,
+            }
+        } else {
+            pg_text_to_json(pg_value, column.type_())?
+        };
+        Ok((name.to_string(), json_value))
+    });

    if array_mode {
        // drop keys and aggregate into array
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -33,7 +33,6 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls"
 aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] }

 pageserver = { path = "../pageserver" }
-remote_storage = { path = "../libs/remote_storage" }

 tracing.workspace = true
 tracing-subscriber.workspace = true
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -1,18 +1,13 @@
 use std::collections::HashSet;

 use anyhow::Context;
-use aws_sdk_s3::{types::ObjectIdentifier, Client};
+use aws_sdk_s3::Client;
 use tracing::{error, info, warn};
-use utils::generation::Generation;

 use crate::cloud_admin_api::BranchData;
-use crate::metadata_stream::stream_listing;
-use crate::{download_object_with_retries, RootTarget};
-use futures_util::{pin_mut, StreamExt};
-use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
+use crate::{download_object_with_retries, list_objects_with_retries, RootTarget};
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::IndexPart;
-use remote_storage::RemotePath;
 use utils::id::TenantTimelineId;

 pub(crate) struct TimelineAnalysis {
@@ -73,7 +68,6 @@ pub(crate) async fn branch_cleanup_and_check_errors(
            match s3_data.blob_data {
                BlobDataParseResult::Parsed {
                    index_part,
-                    index_part_generation,
                    mut s3_layers,
                } => {
                    if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
@@ -113,62 +107,33 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                                        ))
                        }

-                        let layer_map_key = (layer, metadata.generation);
-                        if !s3_layers.remove(&layer_map_key) {
-                            // FIXME: this will emit false positives if an index was
-                            // uploaded concurrently with our scan.  To make this check
-                            // correct, we need to try sending a HEAD request for the
-                            // layer we think is missing.
+                        if !s3_layers.remove(&layer) {
                            result.errors.push(format!(
-                                "index_part.json contains a layer {}{} that is not present in remote storage",
-                                layer_map_key.0.file_name(),
-                                layer_map_key.1.get_suffix()
+                                "index_part.json contains a layer {} that is not present in S3",
+                                layer.file_name(),
                            ))
                        }
                    }

-                    let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
-                        .into_iter()
-                        .filter(|(_layer_name, gen)|
-                            // A layer is only considered orphaned if it has a generation below
-                            // the index.  If the generation is >= the index, then the layer may
-                            // be an upload from a running pageserver, or even an upload from
-                            // a new generation that didn't upload an index yet.
-                            //
-                            // Even so, a layer that is not referenced by the index could just
-                            // be something enqueued for deletion, so while this check is valid 
-                            // for indicating that a layer is garbage, it is not an indicator
-                            // of a problem.
-                            gen < &index_part_generation)
-                        .collect();
-
-                    if !orphan_layers.is_empty() {
+                    if !s3_layers.is_empty() {
                        result.errors.push(format!(
                            "index_part.json does not contain layers from S3: {:?}",
-                            orphan_layers
+                            s3_layers
                                .iter()
-                                .map(|(layer_name, gen)| format!(
-                                    "{}{}",
-                                    layer_name.file_name(),
-                                    gen.get_suffix()
-                                ))
+                                .map(|layer_name| layer_name.file_name())
                                .collect::<Vec<_>>(),
                        ));
-                        result.garbage_keys.extend(orphan_layers.iter().map(
-                            |(layer_name, layer_gen)| {
+                        result
+                            .garbage_keys
+                            .extend(s3_layers.iter().map(|layer_name| {
                                let mut key = s3_root.timeline_root(id).prefix_in_bucket;
                                let delimiter = s3_root.delimiter();
                                if !key.ends_with(delimiter) {
                                    key.push_str(delimiter);
                                }
-                                key.push_str(&format!(
-                                    "{}{}",
-                                    &layer_name.file_name(),
-                                    layer_gen.get_suffix()
-                                ));
+                                key.push_str(&layer_name.file_name());
                                key
-                            },
-                        ));
+                            }));
                    }
                }
                BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
@@ -213,96 +178,69 @@ pub(crate) struct S3TimelineBlobData {
 pub(crate) enum BlobDataParseResult {
    Parsed {
        index_part: IndexPart,
-        index_part_generation: Generation,
-        s3_layers: HashSet<(LayerFileName, Generation)>,
+        s3_layers: HashSet<LayerFileName>,
    },
    Incorrect(Vec<String>),
 }

-fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> {
-    match name.rsplit_once('-') {
-        // FIXME: this is gross, just use a regex?
-        Some((layer_filename, gen)) if gen.len() == 8 => {
-            let layer = layer_filename.parse::<LayerFileName>()?;
-            let gen =
-                Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?;
-            Ok((layer, gen))
-        }
-        _ => Ok((name.parse::<LayerFileName>()?, Generation::none())),
-    }
-}
-
 pub(crate) async fn list_timeline_blobs(
    s3_client: &Client,
    id: TenantTimelineId,
    s3_root: &RootTarget,
 ) -> anyhow::Result<S3TimelineBlobData> {
    let mut s3_layers = HashSet::new();
+    let mut index_part_object = None;
+
+    let timeline_dir_target = s3_root.timeline_root(&id);
+    let mut continuation_token = None;

    let mut errors = Vec::new();
    let mut keys_to_remove = Vec::new();

-    let mut timeline_dir_target = s3_root.timeline_root(&id);
-    timeline_dir_target.delimiter = String::new();
+    loop {
+        let fetch_response =
+            list_objects_with_retries(s3_client, &timeline_dir_target, continuation_token.clone())
+                .await?;

-    let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
+        let subdirectories = fetch_response.common_prefixes().unwrap_or_default();
+        if !subdirectories.is_empty() {
+            errors.push(format!(
+                "S3 list response should not contain any subdirectories, but got {subdirectories:?}"
+            ));
+        }

-    let stream = stream_listing(s3_client, &timeline_dir_target);
-    pin_mut!(stream);
-    while let Some(obj) = stream.next().await {
-        let obj = obj?;
-        let key = match obj.key() {
-            Some(k) => k,
-            None => continue,
-        };
-
-        let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
-        match blob_name {
-            Some(name) if name.starts_with("index_part.json") => {
-                tracing::info!("Index key {key}");
-                index_parts.push(obj)
-            }
-            Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
-                Ok((new_layer, gen)) => {
-                    tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
-                    s3_layers.insert((new_layer, gen));
-                }
-                Err(e) => {
-                    tracing::info!("Error parsing key {maybe_layer_name}");
-                    errors.push(
-                        format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
-                    );
+        for (object, key) in fetch_response
+            .contents()
+            .unwrap_or_default()
+            .iter()
+            .filter_map(|object| Some((object, object.key()?)))
+        {
+            let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
+            match blob_name {
+                Some("index_part.json") => index_part_object = Some(object.clone()),
+                Some(maybe_layer_name) => match maybe_layer_name.parse::<LayerFileName>() {
+                    Ok(new_layer) => {
+                        s3_layers.insert(new_layer);
+                    }
+                    Err(e) => {
+                        errors.push(
+                            format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
+                        );
+                        keys_to_remove.push(key.to_string());
+                    }
+                },
+                None => {
+                    errors.push(format!("S3 list response got an object with odd key {key}"));
                    keys_to_remove.push(key.to_string());
                }
-            },
-            None => {
-                tracing::info!("Peculiar key {}", key);
-                errors.push(format!("S3 list response got an object with odd key {key}"));
-                keys_to_remove.push(key.to_string());
            }
        }
-    }

-    // Choose the index_part with the highest generation
-    let (index_part_object, index_part_generation) = match index_parts
-        .iter()
-        .filter_map(|k| {
-            let key = k.key().unwrap();
-            // Stripping the index key to the last part, because RemotePath doesn't
-            // like absolute paths, and depending on prefix_in_bucket it's possible
-            // for the keys we read back to start with a slash.
-            let basename = key.rsplit_once('/').unwrap().1;
-            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g))
-        })
-        .max_by_key(|i| i.1)
-        .map(|(k, g)| (k.clone(), g))
-    {
-        Some((key, gen)) => (Some(key), gen),
-        None => {
-            // Legacy/missing case: one or zero index parts, which did not have a generation
-            (index_parts.pop(), Generation::none())
+        match fetch_response.next_continuation_token {
+            Some(new_token) => continuation_token = Some(new_token),
+            None => break,
        }
-    };
+    }

    if index_part_object.is_none() {
        errors.push("S3 list response got no index_part.json file".to_string());
@@ -323,7 +261,6 @@ pub(crate) async fn list_timeline_blobs(
                return Ok(S3TimelineBlobData {
                    blob_data: BlobDataParseResult::Parsed {
                        index_part,
-                        index_part_generation,
                        s3_layers,
                    },
                    keys_to_remove,
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/s3_scrubber/src/cloud_admin_api.rs
@@ -5,7 +5,6 @@ use std::time::Duration;

 use chrono::{DateTime, Utc};
 use hex::FromHex;
-use pageserver::tenant::Tenant;
 use reqwest::{header, Client, StatusCode, Url};
 use serde::Deserialize;
 use tokio::sync::Semaphore;
@@ -119,18 +118,13 @@ fn from_nullable_id<'de, D>(deserializer: D) -> Result<TenantId, D::Error>
 where
    D: serde::de::Deserializer<'de>,
 {
-    if deserializer.is_human_readable() {
-        let id_str = String::deserialize(deserializer)?;
-        if id_str.is_empty() {
-            // This is a bogus value, but for the purposes of the scrubber all that
-            // matters is that it doesn't collide with any real IDs.
-            Ok(TenantId::from([0u8; 16]))
-        } else {
-            TenantId::from_hex(&id_str).map_err(|e| serde::de::Error::custom(format!("{e}")))
-        }
+    let id_str = String::deserialize(deserializer)?;
+    if id_str.is_empty() {
+        // This is a bogus value, but for the purposes of the scrubber all that
+        // matters is that it doesn't collide with any real IDs.
+        Ok(TenantId::from([0u8; 16]))
    } else {
-        let id_arr = <[u8; 16]>::deserialize(deserializer)?;
-        Ok(TenantId::from(id_arr))
+        TenantId::from_hex(&id_str).map_err(|e| serde::de::Error::custom(format!("{e}")))
    }
 }

@@ -159,6 +153,7 @@ pub struct ProjectData {
    pub maintenance_set: Option<String>,
 }

+#[serde_with::serde_as]
 #[derive(Debug, serde::Deserialize)]
 pub struct BranchData {
    pub id: BranchId,
@@ -166,10 +161,12 @@ pub struct BranchData {
    pub updated_at: DateTime<Utc>,
    pub name: String,
    pub project_id: ProjectId,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub timeline_id: TimelineId,
    #[serde(default)]
    pub parent_id: Option<BranchId>,
    #[serde(default)]
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    pub parent_lsn: Option<Lsn>,
    pub default: bool,
    pub deleted: bool,
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -34,9 +34,6 @@ const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
 #[derive(Debug, Clone)]
 pub struct S3Target {
    pub bucket_name: String,
-    /// This `prefix_in_bucket` is only equal to the PS/SK config of the same
-    /// name for the RootTarget: other instances of S3Target will have prefix_in_bucket
-    /// with extra parts.
    pub prefix_in_bucket: String,
    pub delimiter: String,
 }
@@ -80,13 +77,9 @@ impl Display for NodeKind {
 impl S3Target {
    pub fn with_sub_segment(&self, new_segment: &str) -> Self {
        let mut new_self = self.clone();
-        if new_self.prefix_in_bucket.is_empty() {
-            new_self.prefix_in_bucket = format!("/{}/", new_segment);
-        } else {
-            let _ = new_self.prefix_in_bucket.pop();
-            new_self.prefix_in_bucket =
-                [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
-        }
+        let _ = new_self.prefix_in_bucket.pop();
+        new_self.prefix_in_bucket =
+            [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
        new_self
    }
 }
@@ -98,10 +91,10 @@ pub enum RootTarget {
 }

 impl RootTarget {
-    pub fn tenants_root(&self) -> S3Target {
+    pub fn tenants_root(&self) -> &S3Target {
        match self {
-            Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
-            Self::Safekeeper(root) => root.with_sub_segment("wal"),
+            Self::Pageserver(root) => root,
+            Self::Safekeeper(root) => root,
        }
    }

@@ -140,7 +133,6 @@ impl RootTarget {
 pub struct BucketConfig {
    pub region: String,
    pub bucket: String,
-    pub prefix_in_bucket: Option<String>,

    /// Use SSO if this is set, else rely on AWS_* environment vars
    pub sso_account_id: Option<String>,
@@ -163,12 +155,10 @@ impl BucketConfig {
        let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
        let region = env::var("REGION").context("'REGION' param retrieval")?;
        let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
-        let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();

        Ok(Self {
            region,
            bucket,
-            prefix_in_bucket,
            sso_account_id,
        })
    }
@@ -201,14 +191,14 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
        .with_target(false)
        .with_ansi(false)
        .with_writer(file_writer);
-    let stderr_logs = fmt::Layer::new()
-        .with_ansi(std::io::stderr().is_terminal())
+    let stdout_logs = fmt::Layer::new()
+        .with_ansi(std::io::stdout().is_terminal())
        .with_target(false)
-        .with_writer(std::io::stderr);
+        .with_writer(std::io::stdout);
    tracing_subscriber::registry()
        .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
        .with(file_logs)
-        .with(stderr_logs)
+        .with(stdout_logs)
        .init();

    guard
@@ -260,20 +250,15 @@ fn init_remote(
    let bucket_region = Region::new(bucket_config.region);
    let delimiter = "/".to_string();
    let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
-
    let s3_root = match node_kind {
        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config
-                .prefix_in_bucket
-                .unwrap_or("pageserver/v1".to_string()),
+            prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(&delimiter),
            delimiter,
        }),
        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config
-                .prefix_in_bucket
-                .unwrap_or("safekeeper/v1".to_string()),
+            prefix_in_bucket: ["safekeeper", "v1", "wal", ""].join(&delimiter),
            delimiter,
        }),
    };
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -31,10 +31,7 @@ enum Command {
        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
        mode: PurgeMode,
    },
-    ScanMetadata {
-        #[arg(short, long, default_value_t = false)]
-        json: bool,
-    },
+    ScanMetadata {},
 }

 #[tokio::main]
@@ -57,17 +54,13 @@ async fn main() -> anyhow::Result<()> {
    ));

    match cli.command {
-        Command::ScanMetadata { json } => match scan_metadata(bucket_config).await {
+        Command::ScanMetadata {} => match scan_metadata(bucket_config).await {
            Err(e) => {
                tracing::error!("Failed: {e}");
                Err(e)
            }
            Ok(summary) => {
-                if json {
-                    println!("{}", serde_json::to_string(&summary).unwrap())
-                } else {
-                    println!("{}", summary.summary_string());
-                }
+                println!("{}", summary.summary_string());
                if summary.is_fatal() {
                    Err(anyhow::anyhow!("Fatal scrub errors detected"))
                } else {
--- a/s3_scrubber/src/metadata_stream.rs
+++ b/s3_scrubber/src/metadata_stream.rs
@@ -13,10 +13,10 @@ pub fn stream_tenants<'a>(
 ) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
    try_stream! {
        let mut continuation_token = None;
-        let tenants_target = target.tenants_root();
        loop {
+            let tenants_target = target.tenants_root();
            let fetch_response =
-                list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?;
+                list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?;

            let new_entry_ids = fetch_response
                .common_prefixes()
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -10,10 +10,8 @@ use aws_sdk_s3::Client;
 use futures_util::{pin_mut, StreamExt, TryStreamExt};
 use histogram::Histogram;
 use pageserver::tenant::IndexPart;
-use serde::Serialize;
 use utils::id::TenantTimelineId;

-#[derive(Serialize)]
 pub struct MetadataSummary {
    count: usize,
    with_errors: HashSet<TenantTimelineId>,
@@ -27,9 +25,7 @@ pub struct MetadataSummary {
 }

 /// A histogram plus minimum and maximum tracking
-#[derive(Serialize)]
 struct MinMaxHisto {
-    #[serde(skip)]
    histo: Histogram,
    min: u64,
    max: u64,
@@ -113,7 +109,6 @@ impl MetadataSummary {
        self.count += 1;
        if let BlobDataParseResult::Parsed {
            index_part,
-            index_part_generation: _,
            s3_layers: _,
        } = &data.blob_data
        {
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -47,7 +47,6 @@ pq_proto.workspace = true
 remote_storage.workspace = true
 safekeeper_api.workspace = true
 storage_broker.workspace = true
-tokio-stream.workspace = true
 utils.workspace = true

 workspace_hack.workspace = true
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -13,7 +13,7 @@ use utils::{
 };

 /// Persistent consensus state of the acceptor.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 struct AcceptorStateV1 {
    /// acceptor's last term it voted for (advanced in 1 phase)
    term: Term,
@@ -21,7 +21,7 @@ struct AcceptorStateV1 {
    epoch: Term,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 struct SafeKeeperStateV1 {
    /// persistent acceptor state
    acceptor_state: AcceptorStateV1,
@@ -50,7 +50,7 @@ pub struct ServerInfoV2 {
    pub wal_seg_size: u32,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SafeKeeperStateV2 {
    /// persistent acceptor state
    pub acceptor_state: AcceptorState,
@@ -81,7 +81,7 @@ pub struct ServerInfoV3 {
    pub wal_seg_size: u32,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SafeKeeperStateV3 {
    /// persistent acceptor state
    pub acceptor_state: AcceptorState,
@@ -101,7 +101,7 @@ pub struct SafeKeeperStateV3 {
    pub wal_start_lsn: Lsn,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SafeKeeperStateV4 {
    #[serde(with = "hex")]
    pub tenant_id: TenantId,
@@ -264,245 +264,3 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
    }
    bail!("unsupported safekeeper control file version {}", version)
 }
-
-#[cfg(test)]
-mod tests {
-    use std::str::FromStr;
-
-    use utils::{id::NodeId, Hex};
-
-    use crate::safekeeper::PersistedPeerInfo;
-
-    use super::*;
-
-    #[test]
-    fn roundtrip_v1() {
-        let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
-        let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
-        let state = SafeKeeperStateV1 {
-            acceptor_state: AcceptorStateV1 {
-                term: 42,
-                epoch: 43,
-            },
-            server: ServerInfoV2 {
-                pg_version: 14,
-                system_id: 0x1234567887654321,
-                tenant_id,
-                timeline_id,
-                wal_seg_size: 0x12345678,
-            },
-            proposer_uuid: {
-                let mut arr = timeline_id.as_arr();
-                arr.reverse();
-                arr
-            },
-            commit_lsn: Lsn(1234567800),
-            truncate_lsn: Lsn(123456780),
-            wal_start_lsn: Lsn(1234567800 - 8),
-        };
-
-        let ser = state.ser().unwrap();
-        #[rustfmt::skip]
-        let expected = [
-            // term
-            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // epoch
-            0x2b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // pg_version
-            0x0e, 0x00, 0x00, 0x00,
-            // system_id
-            0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
-            // tenant_id
-            0xcf, 0x04, 0x80, 0x92, 0x97, 0x07, 0xee, 0x75, 0x37, 0x23, 0x37, 0xef, 0xaa, 0x5e, 0xcf, 0x96,
-            // timeline_id
-            0x11, 0x2d, 0xed, 0x66, 0x42, 0x2a, 0xa5, 0xe9, 0x53, 0xe5, 0x44, 0x0f, 0xa5, 0x42, 0x7a, 0xc4,
-            // wal_seg_size
-            0x78, 0x56, 0x34, 0x12,
-            // proposer_uuid
-            0xc4, 0x7a, 0x42, 0xa5, 0x0f, 0x44, 0xe5, 0x53, 0xe9, 0xa5, 0x2a, 0x42, 0x66, 0xed, 0x2d, 0x11,
-            // commit_lsn
-            0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-            // truncate_lsn
-            0x0c, 0xcd, 0x5b, 0x07, 0x00, 0x00, 0x00, 0x00,
-            // wal_start_lsn
-            0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-        ];
-
-        assert_eq!(Hex(&ser), Hex(&expected));
-
-        let deser = SafeKeeperStateV1::des(&ser).unwrap();
-
-        assert_eq!(state, deser);
-    }
-
-    #[test]
-    fn roundtrip_v2() {
-        let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
-        let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
-        let state = SafeKeeperStateV2 {
-            acceptor_state: AcceptorState {
-                term: 42,
-                term_history: TermHistory(vec![TermLsn {
-                    lsn: Lsn(0x1),
-                    term: 41,
-                }]),
-            },
-            server: ServerInfoV2 {
-                pg_version: 14,
-                system_id: 0x1234567887654321,
-                tenant_id,
-                timeline_id,
-                wal_seg_size: 0x12345678,
-            },
-            proposer_uuid: {
-                let mut arr = timeline_id.as_arr();
-                arr.reverse();
-                arr
-            },
-            commit_lsn: Lsn(1234567800),
-            truncate_lsn: Lsn(123456780),
-            wal_start_lsn: Lsn(1234567800 - 8),
-        };
-
-        let ser = state.ser().unwrap();
-        let expected = [
-            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56,
-            0x34, 0x12, 0xcf, 0x04, 0x80, 0x92, 0x97, 0x07, 0xee, 0x75, 0x37, 0x23, 0x37, 0xef,
-            0xaa, 0x5e, 0xcf, 0x96, 0x11, 0x2d, 0xed, 0x66, 0x42, 0x2a, 0xa5, 0xe9, 0x53, 0xe5,
-            0x44, 0x0f, 0xa5, 0x42, 0x7a, 0xc4, 0x78, 0x56, 0x34, 0x12, 0xc4, 0x7a, 0x42, 0xa5,
-            0x0f, 0x44, 0xe5, 0x53, 0xe9, 0xa5, 0x2a, 0x42, 0x66, 0xed, 0x2d, 0x11, 0x78, 0x02,
-            0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0x0c, 0xcd, 0x5b, 0x07, 0x00, 0x00, 0x00, 0x00,
-            0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-        ];
-
-        assert_eq!(Hex(&ser), Hex(&expected));
-
-        let deser = SafeKeeperStateV2::des(&ser).unwrap();
-
-        assert_eq!(state, deser);
-    }
-
-    #[test]
-    fn roundtrip_v3() {
-        let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
-        let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
-        let state = SafeKeeperStateV3 {
-            acceptor_state: AcceptorState {
-                term: 42,
-                term_history: TermHistory(vec![TermLsn {
-                    lsn: Lsn(0x1),
-                    term: 41,
-                }]),
-            },
-            server: ServerInfoV3 {
-                pg_version: 14,
-                system_id: 0x1234567887654321,
-                tenant_id,
-                timeline_id,
-                wal_seg_size: 0x12345678,
-            },
-            proposer_uuid: {
-                let mut arr = timeline_id.as_arr();
-                arr.reverse();
-                arr
-            },
-            commit_lsn: Lsn(1234567800),
-            truncate_lsn: Lsn(123456780),
-            wal_start_lsn: Lsn(1234567800 - 8),
-        };
-
-        let ser = state.ser().unwrap();
-        let expected = [
-            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56,
-            0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x66, 0x30, 0x34,
-            0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37,
-            0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36,
-            0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x31, 0x31, 0x32, 0x64, 0x65, 0x64,
-            0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35,
-            0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, 0x78, 0x56,
-            0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x34, 0x37, 0x61,
-            0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39,
-            0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
-            0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0x0c, 0xcd, 0x5b, 0x07, 0x00, 0x00,
-            0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-        ];
-
-        assert_eq!(Hex(&ser), Hex(&expected));
-
-        let deser = SafeKeeperStateV3::des(&ser).unwrap();
-
-        assert_eq!(state, deser);
-    }
-
-    #[test]
-    fn roundtrip_v4() {
-        let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
-        let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
-        let state = SafeKeeperStateV4 {
-            tenant_id,
-            timeline_id,
-            acceptor_state: AcceptorState {
-                term: 42,
-                term_history: TermHistory(vec![TermLsn {
-                    lsn: Lsn(0x1),
-                    term: 41,
-                }]),
-            },
-            server: ServerInfo {
-                pg_version: 14,
-                system_id: 0x1234567887654321,
-                wal_seg_size: 0x12345678,
-            },
-            proposer_uuid: {
-                let mut arr = timeline_id.as_arr();
-                arr.reverse();
-                arr
-            },
-            peers: PersistedPeers(vec![(
-                NodeId(1),
-                PersistedPeerInfo {
-                    backup_lsn: Lsn(1234567000),
-                    term: 42,
-                    flush_lsn: Lsn(1234567800 - 8),
-                    commit_lsn: Lsn(1234567600),
-                },
-            )]),
-            commit_lsn: Lsn(1234567800),
-            s3_wal_lsn: Lsn(1234567300),
-            peer_horizon_lsn: Lsn(9999999),
-            remote_consistent_lsn: Lsn(1234560000),
-        };
-
-        let ser = state.ser().unwrap();
-        let expected = [
-            0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x66, 0x30, 0x34, 0x38, 0x30,
-            0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33,
-            0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, 0x20, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36,
-            0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34,
-            0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, 0x2a, 0x00, 0x00, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56,
-            0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x34, 0x37, 0x61,
-            0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39,
-            0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
-            0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x96, 0x49, 0x00, 0x00,
-            0x00, 0x00, 0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x95, 0x49,
-            0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
-            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00,
-            0x00, 0x00, 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-        ];
-
-        assert_eq!(Hex(&ser), Hex(&expected));
-
-        let deser = SafeKeeperStateV4::des(&ser).unwrap();
-
-        assert_eq!(state, deser);
-    }
-}
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -5,7 +5,6 @@ use std::fs::DirEntry;
 use std::io::BufReader;
 use std::io::Read;
 use std::path::PathBuf;
-use std::sync::Arc;

 use anyhow::Result;
 use camino::Utf8Path;
@@ -14,6 +13,7 @@ use postgres_ffi::XLogSegNo;
 use serde::Deserialize;
 use serde::Serialize;

+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::NodeId;
 use utils::id::TenantTimelineId;
 use utils::id::{TenantId, TimelineId};
@@ -28,7 +28,7 @@ use crate::send_wal::WalSenderState;
 use crate::GlobalTimelines;

 /// Various filters that influence the resulting JSON output.
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct Args {
    /// Dump all available safekeeper state. False by default.
    pub dump_all: bool,
@@ -53,76 +53,15 @@ pub struct Args {
 }

 /// Response for debug dump request.
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct Response {
    pub start_time: DateTime<Utc>,
    pub finish_time: DateTime<Utc>,
-    pub timelines: Vec<TimelineDumpSer>,
+    pub timelines: Vec<Timeline>,
    pub timelines_count: usize,
    pub config: Config,
 }

-pub struct TimelineDumpSer {
-    pub tli: Arc<crate::timeline::Timeline>,
-    pub args: Args,
-    pub runtime: Arc<tokio::runtime::Runtime>,
-}
-
-impl std::fmt::Debug for TimelineDumpSer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("TimelineDumpSer")
-            .field("tli", &self.tli.ttid)
-            .field("args", &self.args)
-            .finish()
-    }
-}
-
-impl Serialize for TimelineDumpSer {
-    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let dump = self
-            .runtime
-            .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
-        dump.serialize(serializer)
-    }
-}
-
-async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
-    let control_file = if args.dump_control_file {
-        let mut state = timeline.get_state().await.1;
-        if !args.dump_term_history {
-            state.acceptor_state.term_history = TermHistory(vec![]);
-        }
-        Some(state)
-    } else {
-        None
-    };
-
-    let memory = if args.dump_memory {
-        Some(timeline.memory_dump().await)
-    } else {
-        None
-    };
-
-    let disk_content = if args.dump_disk_content {
-        // build_disk_content can fail, but we don't want to fail the whole
-        // request because of that.
-        build_disk_content(&timeline.timeline_dir).ok()
-    } else {
-        None
-    };
-
-    Timeline {
-        tenant_id: timeline.ttid.tenant_id,
-        timeline_id: timeline.ttid.timeline_id,
-        control_file,
-        memory,
-        disk_content,
-    }
-}
-
 /// Safekeeper configuration.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Config {
@@ -135,9 +74,12 @@ pub struct Config {
    pub wal_backup_enabled: bool,
 }

+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Timeline {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub control_file: Option<SafeKeeperState>,
    pub memory: Option<Memory>,
@@ -198,12 +140,8 @@ pub async fn build(args: Args) -> Result<Response> {
        GlobalTimelines::get_all()
    };

+    // TODO: return Stream instead of Vec
    let mut timelines = Vec::new();
-    let runtime = Arc::new(
-        tokio::runtime::Builder::new_current_thread()
-            .build()
-            .unwrap(),
-    );
    for tli in ptrs_snapshot {
        let ttid = tli.ttid;
        if let Some(tenant_id) = args.tenant_id {
@@ -217,11 +155,38 @@ pub async fn build(args: Args) -> Result<Response> {
            }
        }

-        timelines.push(TimelineDumpSer {
-            tli,
-            args: args.clone(),
-            runtime: runtime.clone(),
-        });
+        let control_file = if args.dump_control_file {
+            let mut state = tli.get_state().await.1;
+            if !args.dump_term_history {
+                state.acceptor_state.term_history = TermHistory(vec![]);
+            }
+            Some(state)
+        } else {
+            None
+        };
+
+        let memory = if args.dump_memory {
+            Some(tli.memory_dump().await)
+        } else {
+            None
+        };
+
+        let disk_content = if args.dump_disk_content {
+            // build_disk_content can fail, but we don't want to fail the whole
+            // request because of that.
+            build_disk_content(&tli.timeline_dir).ok()
+        } else {
+            None
+        };
+
+        let timeline = Timeline {
+            tenant_id: ttid.tenant_id,
+            timeline_id: ttid.timeline_id,
+            control_file,
+            memory,
+            disk_content,
+        };
+        timelines.push(timeline);
    }

    let config = GlobalTimelines::get_global_config();
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -4,6 +4,7 @@ use once_cell::sync::Lazy;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use safekeeper_api::models::SkTimelineInfo;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::collections::{HashMap, HashSet};
 use std::fmt;
 use std::str::FromStr;
@@ -12,12 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use tokio::fs::File;
 use tokio::io::AsyncReadExt;
-
-use std::io::Write as _;
-use tokio::sync::mpsc;
-use tokio_stream::wrappers::ReceiverStream;
-use tracing::info_span;
-use utils::http::endpoint::{request_span, ChannelWriter};
+use utils::http::endpoint::request_span;

 use crate::receive_wal::WalReceiverState;
 use crate::safekeeper::Term;
@@ -66,9 +62,11 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {

 /// Same as TermLsn, but serializes LSN using display serializer
 /// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
+#[serde_as]
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct TermSwitchApiEntry {
    pub term: Term,
+    #[serde_as(as = "DisplayFromStr")]
    pub lsn: Lsn,
 }

@@ -90,18 +88,28 @@ pub struct AcceptorStateStatus {
 }

 /// Info about timeline on safekeeper ready for reporting.
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TimelineStatus {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub acceptor_state: AcceptorStateStatus,
    pub pg_info: ServerInfo,
+    #[serde_as(as = "DisplayFromStr")]
    pub flush_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_start_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub local_start_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub backup_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub peer_horizon_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    pub peers: Vec<PeerInfo>,
    pub walsenders: Vec<WalSenderState>,
@@ -365,52 +373,8 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
        .await
        .map_err(ApiError::InternalServerError)?;

-    let started_at = std::time::Instant::now();
-
-    let (tx, rx) = mpsc::channel(1);
-
-    let body = Body::wrap_stream(ReceiverStream::new(rx));
-
-    let mut writer = ChannelWriter::new(128 * 1024, tx);
-
-    let response = Response::builder()
-        .status(200)
-        .header(hyper::header::CONTENT_TYPE, "application/octet-stream")
-        .body(body)
-        .unwrap();
-
-    let span = info_span!("blocking");
-    tokio::task::spawn_blocking(move || {
-        let _span = span.entered();
-
-        let res = serde_json::to_writer(&mut writer, &resp)
-            .map_err(std::io::Error::from)
-            .and_then(|_| writer.flush());
-
-        match res {
-            Ok(()) => {
-                tracing::info!(
-                    bytes = writer.flushed_bytes(),
-                    elapsed_ms = started_at.elapsed().as_millis(),
-                    "responded /v1/debug_dump"
-                );
-            }
-            Err(e) => {
-                tracing::warn!("failed to write out /v1/debug_dump response: {e:#}");
-                // semantics of this error are quite... unclear. we want to error the stream out to
-                // abort the response to somehow notify the client that we failed.
-                //
-                // though, most likely the reason for failure is that the receiver is already gone.
-                drop(
-                    writer
-                        .tx
-                        .blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
-                );
-            }
-        }
-    });
-
-    Ok(response)
+    // TODO: use streaming response
+    json_response(StatusCode::OK, resp)
 }

 /// Safekeeper http router.
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -44,11 +44,8 @@ pub struct AppendLogicalMessage {

    // fields from AppendRequestHeader
    pub term: Term,
-    #[serde(with = "utils::lsn::serde_as_u64")]
    pub epoch_start_lsn: Lsn,
-    #[serde(with = "utils::lsn::serde_as_u64")]
    pub begin_lsn: Lsn,
-    #[serde(with = "utils::lsn::serde_as_u64")]
    pub truncate_lsn: Lsn,
    pub pg_version: u32,
 }
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -1,4 +1,3 @@
-use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};

 use anyhow::{bail, Context, Result};
@@ -6,6 +5,8 @@ use tokio::io::AsyncWriteExt;
 use tracing::info;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};

+use serde_with::{serde_as, DisplayFromStr};
+
 use crate::{
    control_file, debug_dump,
    http::routes::TimelineStatus,
@@ -14,9 +15,12 @@ use crate::{
 };

 /// Info about timeline on safekeeper ready for reporting.
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Request {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub http_hosts: Vec<String>,
 }
@@ -28,16 +32,6 @@ pub struct Response {
    // TODO: add more fields?
 }

-/// Response for debug dump request.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct DebugDumpResponse {
-    pub start_time: DateTime<Utc>,
-    pub finish_time: DateTime<Utc>,
-    pub timelines: Vec<debug_dump::Timeline>,
-    pub timelines_count: usize,
-    pub config: debug_dump::Config,
-}
-
 /// Find the most advanced safekeeper and pull timeline from it.
 pub async fn handle_request(request: Request) -> Result<Response> {
    let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
@@ -109,7 +103,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>

    // Implementing our own scp over HTTP.
    // At first, we need to fetch list of files from safekeeper.
-    let dump: DebugDumpResponse = client
+    let dump: debug_dump::Response = client
        .get(format!(
            "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}",
            host, status.tenant_id, status.timeline_id
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -52,7 +52,7 @@ impl From<(Term, Lsn)> for TermLsn {
    }
 }

-#[derive(Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct TermHistory(pub Vec<TermLsn>);

 impl TermHistory {
@@ -178,7 +178,7 @@ impl fmt::Debug for TermHistory {
 pub type PgUuid = [u8; 16];

 /// Persistent consensus state of the acceptor.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct AcceptorState {
    /// acceptor's last term it voted for (advanced in 1 phase)
    pub term: Term,
@@ -209,16 +209,16 @@ pub struct ServerInfo {
    pub wal_seg_size: u32,
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PersistedPeerInfo {
    /// LSN up to which safekeeper offloaded WAL to s3.
-    pub backup_lsn: Lsn,
+    backup_lsn: Lsn,
    /// Term of the last entry.
-    pub term: Term,
+    term: Term,
    /// LSN of the last record.
-    pub flush_lsn: Lsn,
+    flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
-    pub commit_lsn: Lsn,
+    commit_lsn: Lsn,
 }

 impl PersistedPeerInfo {
@@ -232,12 +232,12 @@ impl PersistedPeerInfo {
    }
 }

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);

 /// Persistent information stored on safekeeper node
 /// On disk data is prefixed by magic and format version and followed by checksum.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SafeKeeperState {
    #[serde(with = "hex")]
    pub tenant_id: TenantId,
@@ -1096,7 +1096,7 @@ mod tests {

    use super::*;
    use crate::wal_storage::Storage;
-    use std::{ops::Deref, str::FromStr, time::Instant};
+    use std::{ops::Deref, time::Instant};

    // fake storage for tests
    struct InMemoryState {
@@ -1314,98 +1314,4 @@ mod tests {
            })
        );
    }
-
-    #[test]
-    fn test_sk_state_bincode_serde_roundtrip() {
-        use utils::Hex;
-        let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
-        let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
-        let state = SafeKeeperState {
-            tenant_id,
-            timeline_id,
-            acceptor_state: AcceptorState {
-                term: 42,
-                term_history: TermHistory(vec![TermLsn {
-                    lsn: Lsn(0x1),
-                    term: 41,
-                }]),
-            },
-            server: ServerInfo {
-                pg_version: 14,
-                system_id: 0x1234567887654321,
-                wal_seg_size: 0x12345678,
-            },
-            proposer_uuid: {
-                let mut arr = timeline_id.as_arr();
-                arr.reverse();
-                arr
-            },
-            timeline_start_lsn: Lsn(0x12345600),
-            local_start_lsn: Lsn(0x12),
-            commit_lsn: Lsn(1234567800),
-            backup_lsn: Lsn(1234567300),
-            peer_horizon_lsn: Lsn(9999999),
-            remote_consistent_lsn: Lsn(1234560000),
-            peers: PersistedPeers(vec![(
-                NodeId(1),
-                PersistedPeerInfo {
-                    backup_lsn: Lsn(1234567000),
-                    term: 42,
-                    flush_lsn: Lsn(1234567800 - 8),
-                    commit_lsn: Lsn(1234567600),
-                },
-            )]),
-        };
-
-        let ser = state.ser().unwrap();
-
-        #[rustfmt::skip]
-        let expected = [
-            // tenant_id as length prefixed hex
-            0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36,
-            // timeline_id as length prefixed hex
-            0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34,
-            // term
-            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // length prefix
-            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // unsure why this order is swapped
-            0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // pg_version
-            0x0e, 0x00, 0x00, 0x00,
-            // systemid
-            0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
-            // wal_seg_size
-            0x78, 0x56, 0x34, 0x12,
-            // pguuid as length prefixed hex
-            0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
-
-            // timeline_start_lsn
-            0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00,
-            0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-            0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-            0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
-            // length prefix for persistentpeers
-            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // nodeid
-            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            // backuplsn
-            0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
-            0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-            0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
-        ];
-
-        assert_eq!(Hex(&ser), Hex(&expected));
-
-        let deser = SafeKeeperState::des(&ser).unwrap();
-
-        assert_eq!(deser, state);
-    }
 }
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -16,6 +16,7 @@ use postgres_ffi::get_current_timestamp;
 use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
 use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use utils::id::TenantTimelineId;
 use utils::lsn::AtomicLsn;
@@ -312,8 +313,10 @@ impl WalSendersShared {
 }

 // Serialized is used only for pretty printing in json.
+#[serde_as]
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalSenderState {
+    #[serde_as(as = "DisplayFromStr")]
    ttid: TenantTimelineId,
    addr: SocketAddr,
    conn_id: ConnectionId,
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -5,8 +5,10 @@ use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
 use postgres_ffi::XLogSegNo;
 use serde::{Deserialize, Serialize};
+use serde_with::serde_as;
 use tokio::fs;

+use serde_with::DisplayFromStr;
 use std::cmp::max;
 use std::sync::Arc;
 use std::time::Duration;
@@ -40,6 +42,7 @@ use crate::SafeKeeperConf;
 use crate::{debug_dump, wal_storage};

 /// Things safekeeper should know about timeline state on peers.
+#[serde_as]
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PeerInfo {
    pub sk_id: NodeId,
@@ -47,10 +50,13 @@ pub struct PeerInfo {
    /// Term of the last entry.
    pub last_log_term: Term,
    /// LSN of the last record.
+    #[serde_as(as = "DisplayFromStr")]
    pub flush_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
    /// sk since backup_lsn.
+    #[serde_as(as = "DisplayFromStr")]
    pub local_start_lsn: Lsn,
    /// When info was received. Serde annotations are not very useful but make
    /// the code compile -- we don't rely on this field externally.
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -81,6 +81,7 @@ FALLBACK_DURATION = {
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
+    "test_runner/performance/test_startup.py::test_startup": 890.114,
    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2868,7 +2868,7 @@ class SafekeeperHttpClient(requests.Session):
        params = params or {}
        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
        res.raise_for_status()
-        res_json = json.loads(res.text)
+        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

@@ -2968,33 +2968,24 @@ class S3Scrubber:
        self.env = env
        self.log_dir = log_dir

-    def scrubber_cli(self, args: list[str], timeout) -> str:
+    def scrubber_cli(self, args, timeout):
        assert isinstance(self.env.pageserver_remote_storage, S3Storage)
        s3_storage = self.env.pageserver_remote_storage

        env = {
            "REGION": s3_storage.bucket_region,
            "BUCKET": s3_storage.bucket_name,
-            "BUCKET_PREFIX": s3_storage.prefix_in_bucket,
-            "RUST_LOG": "DEBUG",
        }
        env.update(s3_storage.access_env_vars())

        if s3_storage.endpoint is not None:
            env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})

-        base_args = [str(self.env.neon_binpath / "s3_scrubber")]
+        base_args = [self.env.neon_binpath / "s3_scrubber"]
        args = base_args + args

-        (output_path, stdout, status_code) = subprocess_capture(
-            self.log_dir,
-            args,
-            echo_stderr=True,
-            echo_stdout=True,
-            env=env,
-            check=False,
-            capture_stdout=True,
-            timeout=timeout,
+        (output_path, _, status_code) = subprocess_capture(
+            self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
        )
        if status_code:
            log.warning(f"Scrub command {args} failed")
@@ -3003,18 +2994,8 @@ class S3Scrubber:

            raise RuntimeError("Remote storage scrub failed")

-        assert stdout is not None
-        return stdout
-
-    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
-
-        try:
-            return json.loads(stdout)
-        except:
-            log.error("Failed to decode JSON output from `scan-metadata`.  Dumping stdout:")
-            log.error(stdout)
-            raise
+    def scan_metadata(self):
+        self.scrubber_cli(["scan-metadata"], timeout=30)


 def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -35,7 +35,6 @@ def subprocess_capture(
    echo_stderr=False,
    echo_stdout=False,
    capture_stdout=False,
-    timeout=None,
    **kwargs: Any,
 ) -> Tuple[str, Optional[str], int]:
    """Run a process and bifurcate its output to files and the `log` logger
@@ -105,7 +104,7 @@ def subprocess_capture(
                stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
                stderr_handler.start()

-                r = p.wait(timeout=timeout)
+                r = p.wait()

                stdout_handler.join()
                stderr_handler.join()
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -1,10 +1,8 @@
 from contextlib import closing

-from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import NeonCompare, PgCompare
 from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn


 #
@@ -20,8 +18,6 @@ from fixtures.types import Lsn
 def test_bulk_insert(neon_with_baseline: PgCompare):
    env = neon_with_baseline

-    start_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
-
    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("create table huge (i int, j int);")
@@ -35,13 +31,6 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
            env.report_peak_memory_use()
            env.report_size()

-    # Report amount of wal written. Useful for comparing vanilla wal format vs
-    # neon wal format, measuring neon write amplification, etc.
-    end_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
-    wal_written_bytes = end_lsn - start_lsn
-    wal_written_mb = round(wal_written_bytes / (1024 * 1024))
-    env.zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
-
    # When testing neon, also check how long it takes the pageserver to reingest the
    # wal from safekeepers. If this number is close to total runtime, then the pageserver
    # is the bottleneck.
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -1,3 +1,6 @@
+from contextlib import closing
+
+import pytest
 import requests
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder
@@ -78,3 +81,49 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)
+
+
+# This test sometimes runs for longer than the global 5 minute timeout.
+@pytest.mark.timeout(900)
+def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    # Start
+    env.neon_cli.create_branch("test_startup")
+    with zenbenchmark.record_duration("startup_time"):
+        endpoint = env.endpoints.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
+
+    # Restart
+    endpoint.stop_and_destroy()
+    with zenbenchmark.record_duration("restart_time"):
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
+
+    # Fill up
+    num_rows = 1000000  # 30 MB
+    num_tables = 100
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            for i in range(num_tables):
+                cur.execute(f"create table t_{i} (i integer);")
+                cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));")
+
+    # Read
+    with zenbenchmark.record_duration("read_time"):
+        endpoint.safe_psql("select * from t_0;")
+
+    # Read again
+    with zenbenchmark.record_duration("second_read_time"):
+        endpoint.safe_psql("select * from t_0;")
+
+    # Restart
+    endpoint.stop_and_destroy()
+    with zenbenchmark.record_duration("restart_with_data"):
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
+
+    # Read
+    with zenbenchmark.record_duration("read_after_restart"):
+        endpoint.safe_psql("select * from t_0;")
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -72,7 +72,7 @@ class DdlForwardingContext:
        self.dbs: Dict[str, str] = {}
        self.roles: Dict[str, str] = {}
        self.fail = False
-        endpoint = "/test/roles_and_databases"
+        endpoint = "/management/api/v2/roles_and_databases"
        ddl_url = f"http://{host}:{port}{endpoint}"
        self.pg.configure(
            [
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -1,14 +1,11 @@
 import time

-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnv,
    logical_replication_sync,
    wait_for_last_flush_lsn,
 )
-from fixtures.types import Lsn
-from fixtures.utils import query_scalar


 def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
@@ -150,89 +147,3 @@ COMMIT;
    endpoint.start()
    # it must be gone (but walproposer slot still exists, hence 1)
    assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1
-
-
-# Test compute start at LSN page of which starts with contrecord
-# https://github.com/neondatabase/neon/issues/5749
-def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
-    env = neon_simple_env
-
-    env.neon_cli.create_branch("init")
-    endpoint = env.endpoints.create_start("init")
-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
-
-    cur = endpoint.connect().cursor()
-    cur.execute("create table t(key int, value text)")
-    cur.execute("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
-    cur.execute("insert into replication_example values (1, 2)")
-    cur.execute("create publication pub1 for table replication_example")
-
-    # now start subscriber
-    vanilla_pg.start()
-    vanilla_pg.safe_psql("create table t(pk integer primary key, value text)")
-    vanilla_pg.safe_psql("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
-
-    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
-    connstr = endpoint.connstr().replace("'", "''")
-    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
-    logical_replication_sync(vanilla_pg, endpoint)
-    vanilla_pg.stop()
-
-    with endpoint.cursor() as cur:
-        # measure how much space logical message takes. Sometimes first attempt
-        # creates huge message and then it stabilizes, have no idea why.
-        for _ in range(3):
-            lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-            log.info(f"current_lsn={lsn_before}")
-            # Non-transactional logical message doesn't write WAL, only XLogInsert's
-            # it, so use transactional. Which is a bit problematic as transactional
-            # necessitates commit record. Alternatively we can do smth like
-            #   select neon_xlogflush(pg_current_wal_insert_lsn());
-            # but isn't much better + that particular call complains on 'xlog flush
-            # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips
-            # page headers.
-            payload = "blahblah"
-            cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')")
-            lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-            lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before
-            logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload)
-            log.info(
-                f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}"
-            )
-
-        # and write logical message spanning exactly as we want
-        lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        log.info(f"current_lsn={lsn_before}")
-        curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        offs = int(curr_lsn) % 8192
-        till_page = 8192 - offs
-        payload_len = (
-            till_page - logical_message_base - 8
-        )  # not sure why 8 is here, it is deduced from experiments
-        log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}")
-
-        # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer
-        payload_len += 8
-
-        cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')")
-        supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        log.info(f"supposedly_page_boundary={supposedly_contrecord_end}")
-        # The calculations to hit the page boundary are very fuzzy, so just
-        # ignore test if we fail to reach it.
-        if not (int(supposedly_contrecord_end) % 8192 == 32):
-            pytest.skip("missed page boundary, bad luck")
-
-        cur.execute("insert into replication_example values (2, 3)")
-
-    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-    endpoint.stop().start()
-
-    cur = endpoint.connect().cursor()
-    # this should flush current wal page
-    cur.execute("insert into replication_example values (3, 4)")
-    vanilla_pg.start()
-    logical_replication_sync(vanilla_pg, endpoint)
-    assert vanilla_pg.safe_psql(
-        "select sum(somedata) from replication_example"
-    ) == endpoint.safe_psql("select sum(somedata) from replication_example")
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -21,7 +21,6 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
-    S3Scrubber,
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
@@ -235,22 +234,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
    assert len(suffixed_objects) > 0
    assert len(legacy_objects) > 0

-    # Flush through deletions to get a clean state for scrub: we are implicitly validating
-    # that our generations-enabled pageserver was able to do deletions of layers
-    # from earlier which don't have a generation.
-    env.pageserver.http_client().deletion_queue_flush(execute=True)
-
    assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0

-    # Having written a mixture of generation-aware and legacy index_part.json,
-    # ensure the scrubber handles the situation as expected.
-    metadata_summary = S3Scrubber(
-        neon_env_builder.test_output_dir, neon_env_builder
-    ).scan_metadata()
-    assert metadata_summary["count"] == 1  # Scrubber should have seen our timeline
-    assert not metadata_summary["with_errors"]
-    assert not metadata_summary["with_warnings"]
-

 def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.enable_generations = True
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -432,47 +432,3 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
    query(200, "BEGIN")
    pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
    assert pid1 != pid2
-
-
-@pytest.mark.timeout(60)
-def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
-
-    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo')")
-
-    def query(status: int, query: str) -> Any:
-        return static_proxy.http_query(
-            query,
-            [],
-            user="http_auth",
-            password="http",
-            expected_code=status,
-        )
-
-    # query generates a million rows - should hit the 10MB reponse limit quickly
-    response = query(
-        400,
-        "select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
-    )
-    assert "response is too large (max is 10485760 bytes)" in response["message"]
-
-
-def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
-
-    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo','bar','baz')")
-
-    def query(status: int, query: str) -> Any:
-        return static_proxy.http_query(
-            query,
-            [],
-            user="http_auth",
-            password="http",
-            expected_code=status,
-        )
-
-    response = query(
-        200,
-        "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
-    )
-    assert response["rows"][0]["data"] == ["foo", "bar", "baz"]
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "763000f1d0873b827829c41f2f6f799ffc0de55c",
-    "postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a",
-    "postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a"
+    "postgres-v16": "550ffa6495a5dc62fccc3a8b449386633758680b",
+    "postgres-v15": "ab67ab96355d61e9d0218630be4aa7db53bf83e7",
+    "postgres-v14": "6669a672ee14ab2c09d44c4552f9a13fad3afc10"
 }
Author	SHA1	Message	Date
Bojan Serafimov	b1de46c18d	wip	2023-11-01 20:50:20 -04:00
Bojan Serafimov	88064d8c1d	wip	2023-11-01 17:13:56 -04:00