wip

2026-07-31 01:40:39 +00:00 · 2023-11-01 20:50:20 -04:00 · 2023-11-01 17:13:56 -04:00
161 changed files with 4322 additions and 7081 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,11 +22,5 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

-[final-excludes]
-# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-# from depending on workspace-hack because most of the dependencies are not used.
-workspace-members = ["vm_monitor"]
-
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,9 +17,8 @@ assignees: ''
 ## Implementation ideas


-```[tasklist]
-### Tasks
-```
+## Tasks
+- [ ]


 ## Other related tasks and Epics
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -723,7 +723,6 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -170,12 +170,6 @@ dependencies = [
 "backtrace",
 ]

-[[package]]
-name = "arc-swap"
-version = "1.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
-
 [[package]]
 name = "archery"
 version = "0.5.0"
@@ -3556,7 +3550,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3569,7 +3563,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3580,7 +3574,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3598,7 +3592,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4064,7 +4058,6 @@ dependencies = [
 "aws-config",
 "aws-credential-types",
 "aws-sdk-s3",
- "aws-smithy-async",
 "aws-smithy-http",
 "aws-types",
 "azure_core",
@@ -4426,7 +4419,6 @@ dependencies = [
 "itertools",
 "pageserver",
 "rand 0.8.5",
- "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
@@ -4485,7 +4477,6 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
- "tokio-stream",
 "toml_edit",
 "tracing",
 "url",
@@ -4688,16 +4679,6 @@ dependencies = [
 "serde_derive",
 ]

-[[package]]
-name = "serde_assert"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eda563240c1288b044209be1f0d38bb4d15044fb3e00dc354fbc922ab4733e80"
-dependencies = [
- "hashbrown 0.13.2",
- "serde",
-]
-
 [[package]]
 name = "serde_derive"
 version = "1.0.183"
@@ -5415,7 +5396,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -5958,7 +5939,6 @@ name = "utils"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "arc-swap",
 "async-trait",
 "bincode",
 "byteorder",
@@ -5985,7 +5965,6 @@ dependencies = [
 "routerify",
 "sentry",
 "serde",
- "serde_assert",
 "serde_json",
 "serde_with",
 "signal-hook",
@@ -6056,6 +6035,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
+ "workspace_hack",
 ]

 [[package]]
@@ -6483,7 +6463,6 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
- "dashmap",
 "either",
 "fail",
 "futures",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,7 +36,6 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 azure_core = "0.16"
 azure_identity = "0.16"
@@ -48,7 +47,6 @@ async-trait = "0.1"
 aws-config = { version = "0.56", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.29"
 aws-smithy-http = "0.56"
-aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
 aws-credential-types = "0.56"
 aws-types = "0.56"
 axum = { version = "0.6.20", features = ["ws"] }
@@ -67,7 +65,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = { version = "5.5.0", features = ["raw-api"] }
+dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -126,7 +124,6 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
-serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
@@ -164,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -205,7 +202,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ################# Binary contents sections

--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,6 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
-ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -79,9 +78,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/Makefile
+++ b/Makefile
@@ -72,10 +72,6 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
-	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
-		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
-		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
-		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -283,6 +283,7 @@ fn main() -> Result<()> {
                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
+            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
@@ -309,6 +310,7 @@ fn main() -> Result<()> {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
                        addr: vm_monitor_addr.clone(),
+                        file_cache_on_disk,
                    })),
                    token.clone(),
                ))
@@ -480,8 +482,6 @@ fn cli() -> clap::Command {
                .value_name("FILECACHE_CONNSTR"),
        )
        .arg(
-            // DEPRECATED, NO LONGER DOES ANYTHING.
-            // See https://github.com/neondatabase/cloud/issues/7516
            Arg::new("file-cache-on-disk")
                .long("file-cache-on-disk")
                .action(clap::ArgAction::SetTrue),
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -710,12 +710,8 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self) -> Result<()> {
-        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
-        Command::new(pgctl_bin)
-            .args(["reload", "-D", &self.pgdata])
-            .output()
-            .expect("cannot run pg_ctl process");
+    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
+        client.simple_query("SELECT pg_reload_conf()")?;
        Ok(())
    }

@@ -728,9 +724,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
-        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
+        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -78,7 +78,7 @@ use regex::Regex;
 use remote_storage::*;
 use serde_json;
 use std::io::Read;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::Path;
 use std::str;
 use tar::Archive;
@@ -281,6 +281,8 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
        max_keys_per_list_response: None,
    };
    let config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
        storage: RemoteStorageKind::AwsS3(config),
    };
    GenericRemoteStorage::from_config(&config)
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
+//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
+//!
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
    base_uri: &str,
    compute_id: &str,
 ) -> Result<Option<ComputeSpec>> {
-    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
+    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,6 +2,7 @@ use crate::{background_process, local_env::LocalEnv};
 use anyhow::anyhow;
 use camino::Utf8PathBuf;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::{path::PathBuf, process::Child};
 use utils::id::{NodeId, TenantId};

@@ -13,8 +14,10 @@ pub struct AttachmentService {

 const COMMAND: &str = "attachment_service";

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    pub node_id: Option<NodeId>,
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY:
+    // SAFETY
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,6 +46,7 @@ use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -56,10 +57,13 @@ use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
+#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
    endpoint_id: String,
+    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,10 +1,11 @@
-//! Local control plane.
-//!
-//! Can start, configure and stop postgres instances running as a local processes.
-//!
-//! Intended to be used in integration tests and in CLI tools for
-//! local installations.
-#![deny(clippy::undocumented_unsafe_blocks)]
+//
+// Local control plane.
+//
+// Can start, configure and stop postgres instances running as local processes.
+//
+// Intended to be used in integration tests and in CLI tools for
+// local installations.
+//

 pub mod attachment_service;
 mod background_process;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context};
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use std::env;
 use std::fs;
@@ -32,6 +33,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
+#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
@@ -57,6 +59,7 @@ pub struct LocalEnv {
    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub default_tenant_id: Option<TenantId>,

    // used to issue tokens during e.g pg start
@@ -81,6 +84,7 @@ pub struct LocalEnv {
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -6,6 +6,7 @@
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -18,6 +19,7 @@ pub type PgIdent = String;

 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
+#[serde_as]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -48,12 +50,12 @@ pub struct ComputeSpec {
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
-
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub timeline_id: Option<TimelineId>,
-
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub pageserver_connstring: Option<String>,
-
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -138,13 +140,14 @@ impl RemoteExtSpec {
    }
 }

+#[serde_as]
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeMode {
    /// A read-write node
    #[default]
    Primary,
    /// A read-only node, pinned at a particular LSN
-    Static(Lsn),
+    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
    /// A read-only node that follows the tip of the branch in hot standby mode
    ///
    /// Future versions may want to distinguish between replicas with hot standby
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
+//!
 //! Shared code for consumption metics collection
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
+//!
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,7 +2,6 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
-#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,6 +4,7 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId};

 #[derive(Serialize, Deserialize)]
@@ -11,8 +12,10 @@ pub struct ReAttachRequest {
    pub node_id: NodeId,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -22,8 +25,10 @@ pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -38,8 +43,10 @@ pub struct ValidateResponse {
    pub tenants: Vec<ValidateResponseTenant>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,7 +6,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
-use serde_with::serde_as;
+use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
    completion,
@@ -174,19 +174,25 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub new_timeline_id: TimelineId,
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub new_tenant_id: TenantId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
@@ -195,6 +201,7 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[serde_as]
 #[derive(Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLoadRequest {
@@ -271,26 +278,31 @@ pub struct LocationConfig {
    pub tenant_conf: TenantConfig,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct TenantCreateResponse(pub TenantId);
+pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);

 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -362,8 +374,10 @@ pub enum TenantAttachmentStatus {
    Failed { reason: String },
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
@@ -374,22 +388,33 @@ pub struct TenantInfo {
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,

+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
    pub last_record_lsn: Lsn,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub prev_record_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
    pub latest_gc_cutoff_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have succesfully uploaded to remote storage
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertizing to safekeepers
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn_visible: Lsn,

    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -401,6 +426,7 @@ pub struct TimelineInfo {
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
@@ -497,13 +523,23 @@ pub struct LayerAccessStats {
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

+#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
-    Open { lsn_start: Lsn },
-    Frozen { lsn_start: Lsn, lsn_end: Lsn },
+    Open {
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_start: Lsn,
+    },
+    Frozen {
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_start: Lsn,
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_end: Lsn,
+    },
 }

+#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
@@ -511,7 +547,9 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

+        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
+        #[serde_as(as = "DisplayFromStr")]
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
@@ -520,6 +558,7 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

+        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,8 +2,6 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -17,7 +15,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace, warn};
+use tracing::{debug, error, info, trace};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
@@ -35,11 +33,6 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
-    /// Authentication failure
-    #[error("Unauthorized: {0}")]
-    Unauthorized(std::borrow::Cow<'static, str>),
-    #[error("Simulated Connection Error")]
-    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -54,9 +47,8 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
+            Self::Disconnected(_) => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -616,7 +608,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &short_error(&e),
+                            &e.to_string(),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -736,20 +728,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
-                    match e {
-                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
-                        QueryError::SimulatedConnectionError => {
-                            return Err(QueryError::SimulatedConnectionError)
-                        }
-                        e => {
-                            log_query_error(query_string, &e);
-                            let short_error = short_error(&e);
-                            self.write_message_noflush(&BeMessage::ErrorResponse(
-                                &short_error,
-                                Some(e.pg_error_code()),
-                            ))?;
-                        }
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                }
                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
            }
@@ -975,8 +959,6 @@ pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
-        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
-        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -993,15 +975,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
-        QueryError::SimulatedConnectionError => {
-            error!("query handler for query '{query}' failed due to a simulated connection error")
-        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
-        QueryError::Unauthorized(e) => {
-            warn!("query handler for '{query}' failed with authentication error: {e}");
-        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,7 +8,6 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -21,7 +20,6 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
-                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,7 +1,6 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
-#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
-aws-smithy-async.workspace = true
 aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,18 +1,21 @@
 //! Azure Blob Storage wrapper

-use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::sync::Arc;
-use std::{borrow::Cow, io::Cursor};
+use std::{borrow::Cow, collections::HashMap, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::Header;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use azure_storage_blobs::{
+    blob::operations::GetBlobBuilder,
+    prelude::{BlobClient, ContainerClient},
+};
 use futures_util::StreamExt;
 use http_types::StatusCode;
 use tokio::io::AsyncRead;
@@ -109,19 +112,16 @@ impl AzureBlobStorage {

    async fn download_for_builder(
        &self,
+        metadata: StorageMetadata,
        builder: GetBlobBuilder,
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

-        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
        let mut buf = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
-            if let Some(blob_meta) = part.blob.metadata {
-                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
-            }
            let data = part
                .data
                .collect()
@@ -131,9 +131,28 @@ impl AzureBlobStorage {
        }
        Ok(Download {
            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(StorageMetadata(metadata)),
+            metadata: Some(metadata),
        })
    }
+    // TODO: drop this helper once the download response itself carries the blob metadata
+    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
+    async fn get_metadata(
+        &self,
+        blob_client: &BlobClient,
+    ) -> Result<StorageMetadata, DownloadError> {
+        let response = blob_client
+            .get_metadata()
+            .into_future()
+            .await
+            .map_err(to_download_error)?;
+
+        let map: HashMap<String, String> = response
+            .metadata
+            .iter()
+            .map(|md| (md.name().as_str().to_string(), md.value().as_str().to_string()))
+            .collect();
+        Ok(StorageMetadata(map))
+    }

    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
        self.concurrency_limiter
@@ -250,9 +269,11 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

+        let metadata = self.get_metadata(&blob_client).await?;
+
        let builder = blob_client.get();

-        self.download_for_builder(builder).await
+        self.download_for_builder(metadata, builder).await
    }

    async fn download_byte_range(
@@ -264,6 +285,8 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

+        let metadata = self.get_metadata(&blob_client).await?;
+
        let mut builder = blob_client.get();

        if let Some(end_exclusive) = end_exclusive {
@@ -278,7 +301,7 @@ impl RemoteStorage for AzureBlobStorage {
            builder = builder.range(Range::new(start_inclusive, end_exclusive));
        }

-        self.download_for_builder(builder).await
+        self.download_for_builder(metadata, builder).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,15 +6,19 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
+use std::{
+    collections::HashMap,
+    fmt::Debug,
+    num::{NonZeroU32, NonZeroUsize},
+    pin::Pin,
+    sync::Arc,
+};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -30,6 +34,12 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

+/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
+/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
+/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
+/// Both cases may trigger a timeline download, which might download a lot of layers. This concurrency is limited by the clients internally, if needed.
+pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
+pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -431,6 +441,10 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
+    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
+    pub max_concurrent_syncs: NonZeroUsize,
+    /// Max allowed errors before the sync task is considered failed and evicted.
+    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -526,6 +540,18 @@ impl RemoteStorageConfig {

        let use_azure = container_name.is_some() && container_region.is_some();

+        let max_concurrent_syncs = NonZeroUsize::new(
+            parse_optional_integer("max_concurrent_syncs", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
+        )
+        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
+
+        let max_sync_errors = NonZeroU32::new(
+            parse_optional_integer("max_sync_errors", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
+        )
+        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
+
        let default_concurrency_limit = if use_azure {
            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
        } else {
@@ -607,7 +633,11 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig { storage }))
+        Ok(Some(RemoteStorageConfig {
+            max_concurrent_syncs,
+            max_sync_errors,
+            storage,
+        }))
    }
 }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,27 +4,23 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::{borrow::Cow, sync::Arc};
+use std::borrow::Cow;

 use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider,
-    meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig,
-    retry::{RetryConfigBuilder, RetryMode},
-    web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::{AsyncSleep, Config, Region, SharedAsyncSleep},
+    config::{Config, Region},
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
-use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
@@ -87,23 +83,10 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

-        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
-        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
-
-        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
-        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
-        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
-        let mut retry_config = RetryConfigBuilder::new();
-        retry_config
-            .set_max_attempts(Some(1))
-            .set_mode(Some(RetryMode::Adaptive));
-
        let mut config_builder = Config::builder()
            .region(region)
            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider)
-            .sleep_impl(SharedAsyncSleep::from(sleep_impl))
-            .retry_config(retry_config.build());
+            .credentials_provider(credentials_provider);

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -469,6 +469,8 @@ fn create_azure_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
            container_region: remote_storage_azure_region,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -396,6 +396,8 @@ fn create_s3_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,18 +1,23 @@
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};

 use utils::{
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub peer_ids: Option<Vec<NodeId>>,
    pub pg_version: u32,
    pub system_id: Option<u64>,
    pub wal_seg_size: Option<u32>,
+    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    // If not passed, it is assigned to the beginning of commit_lsn segment.
    pub local_start_lsn: Option<Lsn>,
@@ -23,6 +28,7 @@ fn lsn_invalid() -> Lsn {
 }

 /// Data about safekeeper's timeline, mirrors broker.proto.
+#[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
    /// Term.
@@ -30,19 +36,25 @@ pub struct SkTimelineInfo {
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub commit_lsn: Lsn,
    /// LSN up to which safekeeper has backed WAL.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub backup_lsn: Lsn,
    /// LSN of last checkpoint uploaded by pageserver.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub remote_consistent_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub peer_horizon_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub local_start_lsn: Lsn,
    /// A connection string to use for WAL receiving.
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,6 +1,4 @@
 //! Synthetic size calculation
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,8 +32,6 @@
 //!         .init();
 //! }
 //! ```
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,6 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
-arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
@@ -56,7 +55,6 @@ bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
-serde_assert.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,8 +1,7 @@
 // For details about authentication see docs/authentication.md

-use arc_swap::ArcSwap;
 use serde;
-use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
+use std::fs;

 use anyhow::Result;
 use camino::Utf8Path;
@@ -10,8 +9,9 @@ use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};

-use crate::{http::error::ApiError, id::TenantId};
+use crate::id::TenantId;

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -32,9 +32,11 @@ pub enum Scope {
 }

 /// JWT payload. See docs/authentication.md for the format
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
    pub scope: Scope,
 }
@@ -45,106 +47,31 @@ impl Claims {
    }
 }

-pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
-
-impl SwappableJwtAuth {
-    pub fn new(jwt_auth: JwtAuth) -> Self {
-        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
-    }
-    pub fn swap(&self, jwt_auth: JwtAuth) {
-        self.0.swap(Arc::new(jwt_auth));
-    }
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
-        self.0.load().decode(token)
-    }
-}
-
-impl std::fmt::Debug for SwappableJwtAuth {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Swappable({:?})", self.0.load())
-    }
-}
-
-#[derive(Clone, PartialEq, Eq, Hash, Debug)]
-pub struct AuthError(pub Cow<'static, str>);
-
-impl Display for AuthError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0)
-    }
-}
-
-impl From<AuthError> for ApiError {
-    fn from(_value: AuthError) -> Self {
-        // Don't pass on the value of the AuthError as a precautionary measure.
-        // Being intentionally vague in public error communication hurts debugability
-        // but it is more secure.
-        ApiError::Forbidden("JWT authentication error".to_string())
-    }
-}
-
 pub struct JwtAuth {
-    decoding_keys: Vec<DecodingKey>,
+    decoding_key: DecodingKey,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
+    pub fn new(decoding_key: DecodingKey) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_keys,
+            decoding_key,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let metadata = key_path.metadata()?;
-        let decoding_keys = if metadata.is_dir() {
-            let mut keys = Vec::new();
-            for entry in fs::read_dir(key_path)? {
-                let path = entry?.path();
-                if !path.is_file() {
-                    // Ignore directories (don't recurse)
-                    continue;
-                }
-                let public_key = fs::read(path)?;
-                keys.push(DecodingKey::from_ed_pem(&public_key)?);
-            }
-            keys
-        } else if metadata.is_file() {
-            let public_key = fs::read(key_path)?;
-            vec![DecodingKey::from_ed_pem(&public_key)?]
-        } else {
-            anyhow::bail!("path is neither a directory or a file")
-        };
-        if decoding_keys.is_empty() {
-            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
-        }
-        Ok(Self::new(decoding_keys))
+        let public_key = fs::read(key_path)?;
+        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
    }

-    /// Attempt to decode the token with the internal decoding keys.
-    ///
-    /// The function tries the stored decoding keys in succession,
-    /// and returns the first yielding a successful result.
-    /// If there is no working decoding key, it returns the last error.
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
-        let mut res = None;
-        for decoding_key in &self.decoding_keys {
-            res = Some(decode(token, decoding_key, &self.validation));
-            if let Some(Ok(res)) = res {
-                return Ok(res);
-            }
-        }
-        if let Some(res) = res {
-            res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
-        } else {
-            Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
-        }
+    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
+        Ok(decode(token, &self.decoding_key, &self.validation)?)
    }
 }

@@ -184,9 +111,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 "#;

    #[test]
-    fn test_decode() {
+    fn test_decode() -> Result<(), anyhow::Error> {
        let expected_claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
            scope: Scope::Tenant,
        };

@@ -205,24 +132,28 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
-        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
        assert_eq!(claims_from_token, expected_claims);
+
+        Ok(())
    }

    #[test]
-    fn test_encode() {
+    fn test_encode() -> Result<(), anyhow::Error> {
        let claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;

        // decode it back
-        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
-        let decoded = auth.decode(&encoded).unwrap();
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let decoded = auth.decode(&encoded)?;

        assert_eq!(decoded.claims, claims);
+
+        Ok(())
    }
 }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 ///
 /// See docs/rfcs/025-generation-numbers.md for detail on how generation
 /// numbers are used.
-#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
 pub enum Generation {
    // Generations with this magic value will not add a suffix to S3 keys, and will not
    // be included in persisted index_part.json.  This value is only to be used
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -1,41 +0,0 @@
-/// Useful type for asserting that expected bytes match reporting the bytes more readable
-/// array-syntax compatible hex bytes.
-///
-/// # Usage
-///
-/// ```
-/// use utils::Hex;
-///
-/// let actual = serialize_something();
-/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
-///
-/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
-/// // output suffixed with an array style length for easier comparisons.
-/// assert_eq!(Hex(&actual), Hex(&expected));
-///
-/// // with `let expected = [0x68];` the error would had been:
-/// // assertion `left == right` failed
-/// //  left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
-/// // right: [0x68; 1]
-/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
-/// ```
-#[derive(PartialEq)]
-pub struct Hex<'a>(pub &'a [u8]);
-
-impl std::fmt::Debug for Hex<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "[")?;
-        for (i, c) in self.0.chunks(16).enumerate() {
-            if i > 0 && !c.is_empty() {
-                writeln!(f, ", ")?;
-            }
-            for (j, b) in c.iter().enumerate() {
-                if j > 0 {
-                    write!(f, ", ")?;
-                }
-                write!(f, "0x{b:02x}")?;
-            }
-        }
-        write!(f, "; {}]", self.0.len())
-    }
-}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{AuthError, Claims, SwappableJwtAuth};
+use crate::auth::{Claims, JwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -14,11 +14,6 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
 use std::future::Future;
 use std::str::FromStr;

-use bytes::{Bytes, BytesMut};
-use std::io::Write as _;
-use tokio::sync::mpsc;
-use tokio_stream::wrappers::ReceiverStream;
-
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -151,89 +146,94 @@ impl Drop for RequestCancelled {
    }
 }

-/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
-pub struct ChannelWriter {
-    buffer: BytesMut,
-    pub tx: mpsc::Sender<std::io::Result<Bytes>>,
-    written: usize,
-}
-
-impl ChannelWriter {
-    pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
-        assert_ne!(buf_len, 0);
-        ChannelWriter {
-            // split about half off the buffer from the start, because we flush depending on
-            // capacity. first flush will come sooner than without this, but now resizes will
-            // have better chance of picking up the "other" half. not guaranteed of course.
-            buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
-            tx,
-            written: 0,
-        }
-    }
-
-    pub fn flush0(&mut self) -> std::io::Result<usize> {
-        let n = self.buffer.len();
-        if n == 0 {
-            return Ok(0);
-        }
-
-        tracing::trace!(n, "flushing");
-        let ready = self.buffer.split().freeze();
-
-        // not ideal to call from blocking code to block_on, but we are sure that this
-        // operation does not spawn_blocking other tasks
-        let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
-            self.tx.send(Ok(ready)).await.map_err(|_| ())?;
-
-            // throttle sending to allow reuse of our buffer in `write`.
-            self.tx.reserve().await.map_err(|_| ())?;
-
-            // now the response task has picked up the buffer and hopefully started
-            // sending it to the client.
-            Ok(())
-        });
-        if res.is_err() {
-            return Err(std::io::ErrorKind::BrokenPipe.into());
-        }
-        self.written += n;
-        Ok(n)
-    }
-
-    pub fn flushed_bytes(&self) -> usize {
-        self.written
-    }
-}
-
-impl std::io::Write for ChannelWriter {
-    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
-        let remaining = self.buffer.capacity() - self.buffer.len();
-
-        let out_of_space = remaining < buf.len();
-
-        let original_len = buf.len();
-
-        if out_of_space {
-            let can_still_fit = buf.len() - remaining;
-            self.buffer.extend_from_slice(&buf[..can_still_fit]);
-            buf = &buf[can_still_fit..];
-            self.flush0()?;
-        }
-
-        // assume that this will often under normal operation just move the pointer back to the
-        // beginning of allocation, because previous split off parts are already sent and
-        // dropped.
-        self.buffer.extend_from_slice(buf);
-        Ok(original_len)
-    }
-
-    fn flush(&mut self) -> std::io::Result<()> {
-        self.flush0().map(|_| ())
-    }
-}
-
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    use bytes::{Bytes, BytesMut};
+    use std::io::Write as _;
+    use tokio::sync::mpsc;
+    use tokio_stream::wrappers::ReceiverStream;
+
    SERVE_METRICS_COUNT.inc();

+    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
+    struct ChannelWriter {
+        buffer: BytesMut,
+        tx: mpsc::Sender<std::io::Result<Bytes>>,
+        written: usize,
+    }
+
+    impl ChannelWriter {
+        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
+            assert_ne!(buf_len, 0);
+            ChannelWriter {
+                // split about half off the buffer from the start, because we flush depending on
+                // capacity. first flush will come sooner than without this, but now resizes will
+                // have better chance of picking up the "other" half. not guaranteed of course.
+                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
+                tx,
+                written: 0,
+            }
+        }
+
+        fn flush0(&mut self) -> std::io::Result<usize> {
+            let n = self.buffer.len();
+            if n == 0 {
+                return Ok(0);
+            }
+
+            tracing::trace!(n, "flushing");
+            let ready = self.buffer.split().freeze();
+
+            // not ideal to call from blocking code to block_on, but we are sure that this
+            // operation does not spawn_blocking other tasks
+            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
+                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
+
+                // throttle sending to allow reuse of our buffer in `write`.
+                self.tx.reserve().await.map_err(|_| ())?;
+
+                // now the response task has picked up the buffer and hopefully started
+                // sending it to the client.
+                Ok(())
+            });
+            if res.is_err() {
+                return Err(std::io::ErrorKind::BrokenPipe.into());
+            }
+            self.written += n;
+            Ok(n)
+        }
+
+        fn flushed_bytes(&self) -> usize {
+            self.written
+        }
+    }
+
+    impl std::io::Write for ChannelWriter {
+        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+            // number of bytes that still fit without growing the current allocation
+            let remaining = self.buffer.capacity() - self.buffer.len();
+            let original_len = buf.len();
+
+            if remaining < buf.len() {
+                // top the buffer up to exactly its capacity, flush that chunk, then buffer the
+                // rest; `buf.len() - remaining` here would over- or under-fill the allocation.
+                let (fits, rest) = buf.split_at(remaining);
+                self.buffer.extend_from_slice(fits);
+                buf = rest;
+                self.flush0()?;
+            }
+
+            // assume that this will often under normal operation just move the pointer back to the
+            // beginning of allocation, because previous split off parts are already sent and
+            // dropped.
+            self.buffer.extend_from_slice(buf);
+            Ok(original_len)
+        }
+
+        fn flush(&mut self) -> std::io::Result<()> {
+            self.flush0().map(|_| ())
+        }
+    }
+
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
@@ -400,11 +400,9 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth.decode(token).map_err(|err| {
-                        warn!("Authentication error: {err}");
-                        // Rely on From<AuthError> for ApiError impl
-                        err
-                    })?;
+                    let data = auth
+                        .decode(token)
+                        .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
                    req.set_context(data.claims);
                }
                None => {
@@ -452,11 +450,16 @@ where

+/// Checks the request's [`Claims`] with `check_permission`.
+///
+/// A request without `Claims` in its context passes, since that means auth is disabled.
+/// A rejection becomes [`ApiError::Forbidden`] carrying the checker's error text.
 pub fn check_permission_with(
    req: &Request<Body>,
-    check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
+    check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
 ) -> Result<(), ApiError> {
    match req.context::<Claims>() {
-        Some(claims) => Ok(check_permission(&claims)
-            .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
+        Some(claims) => {
+            check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))
+        }
        None => Ok(()), // claims is None because auth is disabled
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info, warn};
+use tracing::{error, info};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -118,9 +118,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors

    match api_error {
-        ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
-            warn!("Error processing HTTP request: {api_error:#}")
-        }
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -3,7 +3,6 @@ use std::{fmt, str::FromStr};
 use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
-use serde::de::Visitor;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;

@@ -18,74 +17,12 @@ pub enum IdError {
 ///
 /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
 /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+///
+/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
+/// Check the `serde_with::serde_as` documentation for options for more complex types.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
 struct Id([u8; 16]);

-impl Serialize for Id {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            self.0.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Id {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> Visitor<'de> for IdVisitor {
-            type Value = Id;
-
-            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 16])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 16] = Deserialize::deserialize(s)?;
-                Ok(Id::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Id::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                16,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 impl Id {
    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
        let mut arr = [0u8; 16];
@@ -120,8 +57,6 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
-
-        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
@@ -373,112 +308,3 @@ impl fmt::Display for NodeId {
        write!(f, "{}", self.0)
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use serde_assert::{Deserializer, Serializer, Token, Tokens};
-
-    use crate::bin_ser::BeSer;
-
-    use super::*;
-
-    #[test]
-    fn test_id_serde_non_human_readable() {
-        let original_id = Id([
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ]);
-        let expected_tokens = Tokens(vec![
-            Token::Tuple { len: 16 },
-            Token::U8(173),
-            Token::U8(80),
-            Token::U8(132),
-            Token::U8(115),
-            Token::U8(129),
-            Token::U8(226),
-            Token::U8(72),
-            Token::U8(254),
-            Token::U8(170),
-            Token::U8(201),
-            Token::U8(135),
-            Token::U8(108),
-            Token::U8(199),
-            Token::U8(26),
-            Token::U8(228),
-            Token::U8(24),
-            Token::TupleEnd,
-        ]);
-
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let serialized_tokens = original_id.serialize(&serializer).unwrap();
-        assert_eq!(serialized_tokens, expected_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(serialized_tokens)
-            .build();
-        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
-        assert_eq!(deserialized_id, original_id);
-    }
-
-    #[test]
-    fn test_id_serde_human_readable() {
-        let original_id = Id([
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ]);
-        let expected_tokens = Tokens(vec![Token::Str(String::from(
-            "ad50847381e248feaac9876cc71ae418",
-        ))]);
-
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let serialized_tokens = original_id.serialize(&serializer).unwrap();
-        assert_eq!(serialized_tokens, expected_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(Tokens(vec![Token::Str(String::from(
-                "ad50847381e248feaac9876cc71ae418",
-            ))]))
-            .build();
-        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
-    }
-
-    macro_rules! roundtrip_type {
-        ($type:ty, $expected_bytes:expr) => {{
-            let expected_bytes: [u8; 16] = $expected_bytes;
-            let original_id = <$type>::from(expected_bytes);
-
-            let ser_bytes = original_id.ser().unwrap();
-            assert_eq!(ser_bytes, expected_bytes);
-
-            let des_id = <$type>::des(&ser_bytes).unwrap();
-            assert_eq!(des_id, original_id);
-        }};
-    }
-
-    #[test]
-    fn test_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(Id, expected_bytes);
-    }
-
-    #[test]
-    fn test_tenant_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(TenantId, expected_bytes);
-    }
-
-    #[test]
-    fn test_timeline_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(TimelineId, expected_bytes);
-    }
-}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,5 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
-#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

@@ -25,10 +24,6 @@ pub mod auth;

 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
-
-mod hex;
-pub use hex::Hex;
-
 // http endpoint utils
 pub mod http;

@@ -78,9 +73,6 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

-/// async timeout helper
-pub mod timeout;
-
 pub mod sync;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,7 +1,7 @@
 #![warn(missing_docs)]

 use camino::Utf8Path;
-use serde::{de::Visitor, Deserialize, Serialize};
+use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
 use std::str::FromStr;
@@ -13,114 +13,10 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
+#[serde(transparent)]
 pub struct Lsn(pub u64);

-impl Serialize for Lsn {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            self.0.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Lsn {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct LsnVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> Visitor<'de> for LsnVisitor {
-            type Value = Lsn;
-
-            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str(
-                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
-                    )
-                } else {
-                    formatter.write_str("value in form of integer(u64)")
-                }
-            }
-
-            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Ok(Lsn(v))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Lsn::from_str(v).map_err(|e| E::custom(e))
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(LsnVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_u64(LsnVisitor {
-                is_human_readable_deserializer: false,
-            })
-        }
-    }
-}
-
-/// Allows (de)serialization of an `Lsn` always as `u64`.
-///
-/// ### Example
-///
-/// ```rust
-/// # use serde::{Serialize, Deserialize};
-/// use utils::lsn::Lsn;
-///
-/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
-/// struct Foo {
-///   #[serde(with = "utils::lsn::serde_as_u64")]
-///   always_u64: Lsn,
-/// }
-///
-/// let orig = Foo { always_u64: Lsn(1234) };
-///
-/// let res = serde_json::to_string(&orig).unwrap();
-/// assert_eq!(res, r#"{"always_u64":1234}"#);
-///
-/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
-/// assert_eq!(foo, orig);
-/// ```
-///
-pub mod serde_as_u64 {
-    use super::Lsn;
-
-    /// Serializes the Lsn as u64 disregarding the human readability of the format.
-    ///
-    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
-    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
-        use serde::Serialize;
-        lsn.0.serialize(serializer)
-    }
-
-    /// Deserializes the Lsn as u64 disregarding the human readability of the format.
-    ///
-    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
-    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
-        use serde::Deserialize;
-        u64::deserialize(deserializer).map(Lsn)
-    }
-}
-
 /// We tried to parse an LSN from a string, but failed
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("LsnParseError")]
@@ -368,13 +264,8 @@ impl MonotonicCounter<Lsn> for RecordLsn {

 #[cfg(test)]
 mod tests {
-    use crate::bin_ser::BeSer;
-
    use super::*;

-    use serde::ser::Serialize;
-    use serde_assert::{Deserializer, Serializer, Token, Tokens};
-
    #[test]
    fn test_lsn_strings() {
        assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -450,95 +341,4 @@ mod tests {
        assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
        assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
    }
-
-    #[test]
-    fn test_lsn_serde() {
-        let original_lsn = Lsn(0x0123456789abcdef);
-        let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
-        let expected_non_readable_tokens =
-            Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
-
-        // Testing human_readable ser/de
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-        assert_eq!(readable_ser_tokens, expected_readable_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(readable_ser_tokens)
-            .build();
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-
-        // Testing NON human_readable ser/de
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-        assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(non_readable_ser_tokens)
-            .build();
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-
-        // Testing mismatching ser/de
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(non_readable_ser_tokens)
-            .build();
-        Lsn::deserialize(&mut deserializer).unwrap_err();
-
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(readable_ser_tokens)
-            .build();
-        Lsn::deserialize(&mut deserializer).unwrap_err();
-    }
-
-    #[test]
-    fn test_lsn_ensure_roundtrip() {
-        let original_lsn = Lsn(0xaaaabbbb);
-
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(ser_tokens)
-            .build();
-
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-    }
-
-    #[test]
-    fn test_lsn_bincode_serde() {
-        let lsn = Lsn(0x0123456789abcdef);
-        let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
-
-        let ser_bytes = lsn.ser().unwrap();
-        assert_eq!(ser_bytes, expected_bytes);
-
-        let des_lsn = Lsn::des(&ser_bytes).unwrap();
-        assert_eq!(des_lsn, lsn);
-    }
-
-    #[test]
-    fn test_lsn_bincode_ensure_roundtrip() {
-        let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
-        let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
-
-        let ser_bytes = original_lsn.ser().unwrap();
-        assert_eq!(ser_bytes, expected_bytes);
-
-        let des_lsn = Lsn::des(&ser_bytes).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-    }
 }
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -3,6 +3,7 @@ use std::time::{Duration, SystemTime};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use pq_proto::{read_cstr, PG_EPOCH};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use tracing::{trace, warn};

 use crate::lsn::Lsn;
@@ -14,17 +15,21 @@ use crate::lsn::Lsn;
 ///
 /// serde Serialize is used only for human readable dump to json (e.g. in
 /// safekeepers debug_dump).
+#[serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct PageserverFeedback {
    /// Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
    /// LSN last received and ingested by the pageserver. Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
    pub last_received_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver to its local disc.
    /// Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
    /// consider WAL before it can be removed.
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -125,9 +125,6 @@ where
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

-            // Block any future waiters from starting
-            internal.shutdown = true;
-
            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,7 +1,7 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
 pub fn exit_now(code: u8) -> ! {
-    // SAFETY: exiting is safe, the ffi is not safe
+    // SAFETY: `_exit` just terminates the process; the call is `unsafe` only because it is FFI
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1,3 +1 @@
 pub mod heavier_once_cell;
-
-pub mod gate;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -1,158 +0,0 @@
-use std::{sync::Arc, time::Duration};
-
-/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
-///
-/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
-/// the resource calls `close()` when they want to ensure that all holders of guards
-/// have released them, and that no future guards will be issued.
-pub struct Gate {
-    /// Each caller of enter() takes one unit from the semaphore. In close(), we
-    /// take all the units to ensure all GateGuards are destroyed.
-    sem: Arc<tokio::sync::Semaphore>,
-
-    /// For observability only: a name that will be used to log warnings if a particular
-    /// gate is holding up shutdown
-    name: String,
-}
-
-/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
-/// not complete.
-#[derive(Debug)]
-pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
-
-/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
-async fn warn_if_stuck<Fut: std::future::Future>(
-    fut: Fut,
-    name: &str,
-    warn_period: std::time::Duration,
-) -> <Fut as std::future::Future>::Output {
-    let started = std::time::Instant::now();
-
-    let mut fut = std::pin::pin!(fut);
-
-    loop {
-        match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => return ret,
-            Err(_) => {
-                tracing::warn!(
-                    gate = name,
-                    elapsed_ms = started.elapsed().as_millis(),
-                    "still waiting, taking longer than expected..."
-                );
-            }
-        }
-    }
-}
-
-#[derive(Debug)]
-pub enum GateError {
-    GateClosed,
-}
-
-impl Gate {
-    const MAX_UNITS: u32 = u32::MAX;
-
-    pub fn new(name: String) -> Self {
-        Self {
-            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
-            name,
-        }
-    }
-
-    /// Acquire a guard that will prevent close() calls from completing. If close()
-    /// was already called, this will return an error which should be interpreted
-    /// as "shutting down".
-    ///
-    /// This function would typically be used from e.g. request handlers. While holding
-    /// the guard returned from this function, it is important to respect a CancellationToken
-    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
-    /// also contain a CancellationToken.
-    pub fn enter(&self) -> Result<GateGuard, GateError> {
-        self.sem
-            .clone()
-            .try_acquire_owned()
-            .map(GateGuard)
-            .map_err(|_| GateError::GateClosed)
-    }
-
-    /// Types with a shutdown() method and a gate should call this method at the
-    /// end of shutdown, to ensure that all GateGuard holders are done.
-    ///
-    /// This will wait for all guards to be destroyed.  For this to complete promptly, it is
-    /// important that the holders of such guards are respecting a CancellationToken which has
-    /// been cancelled before entering this function.
-    pub async fn close(&self) {
-        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
-    }
-
-    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
-    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
-    /// the CancellationToken on such types is analogous to "Did shutdown start?"
-    pub fn close_complete(&self) -> bool {
-        self.sem.is_closed()
-    }
-
-    async fn do_close(&self) {
-        tracing::debug!(gate = self.name, "Closing Gate...");
-        match self.sem.acquire_many(Self::MAX_UNITS).await {
-            Ok(_units) => {
-                // While holding all units, close the semaphore.  All subsequent calls to enter() will fail.
-                self.sem.close();
-            }
-            Err(_) => {
-                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
-                // This is legal.  Timeline::shutdown for example is not protected from being called more than
-                // once.
-                tracing::debug!(gate = self.name, "Double close")
-            }
-        }
-        tracing::debug!(gate = self.name, "Closed Gate.")
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use futures::FutureExt;
-
-    use super::*;
-
-    #[tokio::test]
-    async fn test_idle_gate() {
-        // Having taken no gates, we should not be blocked in close
-        let gate = Gate::new("test".to_string());
-        gate.close().await;
-
-        // If a guard is dropped before entering, close should not be blocked
-        let gate = Gate::new("test".to_string());
-        let guard = gate.enter().unwrap();
-        drop(guard);
-        gate.close().await;
-
-        // Entering a closed guard fails
-        gate.enter().expect_err("enter should fail after close");
-    }
-
-    #[tokio::test]
-    async fn test_busy_gate() {
-        let gate = Gate::new("test".to_string());
-
-        let guard = gate.enter().unwrap();
-
-        let mut close_fut = std::pin::pin!(gate.close());
-
-        // Close should be blocked
-        assert!(close_fut.as_mut().now_or_never().is_none());
-
-        // Attempting to enter() should fail, even though close isn't done yet.
-        gate.enter()
-            .expect_err("enter should fail after entering close");
-
-        drop(guard);
-
-        // Guard is gone, close should finish
-        assert!(close_fut.as_mut().now_or_never().is_some());
-
-        // Attempting to enter() is still forbidden
-        gate.enter().expect_err("enter should fail finishing close");
-    }
-}
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,7 +1,4 @@
-use std::sync::{
-    atomic::{AtomicUsize, Ordering},
-    Arc, Mutex, MutexGuard,
-};
+use std::sync::{Arc, Mutex, MutexGuard};
 use tokio::sync::Semaphore;

 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
@@ -13,7 +10,6 @@ use tokio::sync::Semaphore;
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
    inner: Mutex<Inner<T>>,
-    initializers: AtomicUsize,
 }

 impl<T> Default for OnceCell<T> {
@@ -21,7 +17,6 @@ impl<T> Default for OnceCell<T> {
    fn default() -> Self {
        Self {
            inner: Default::default(),
-            initializers: AtomicUsize::new(0),
        }
    }
 }
@@ -54,7 +49,6 @@ impl<T> OnceCell<T> {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
-            initializers: AtomicUsize::new(0),
        }
    }

@@ -66,8 +60,8 @@ impl<T> OnceCell<T> {
    /// Initialization is panic-safe and cancellation-safe.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
-        F: FnOnce(InitPermit) -> Fut,
-        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
+        F: FnOnce() -> Fut,
+        Fut: std::future::Future<Output = Result<T, E>>,
    {
        let sem = {
            let guard = self.inner.lock().unwrap();
@@ -77,61 +71,29 @@ impl<T> OnceCell<T> {
            guard.init_semaphore.clone()
        };

-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
+        let permit = sem.acquire_owned().await;
+        if permit.is_err() {
+            let guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_some(),
+                "semaphore got closed, must be initialized"
+            );
+            return Ok(Guard(guard));
+        } else {
+            // now we try
+            let value = factory().await?;

-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.lock().unwrap();
-
-                Ok(Self::set0(value, guard))
-            }
-            Err(_closed) => {
-                let guard = self.inner.lock().unwrap();
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(Guard(guard));
-            }
+            let mut guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_none(),
+                "we won permit, must not be initialized"
+            );
+            guard.value = Some(value);
+            guard.init_semaphore.close();
+            Ok(Guard(guard))
        }
    }

-    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
-    /// to complete initializing the inner value.
-    ///
-    /// # Panics
-    ///
-    /// If the inner has already been initialized.
-    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
-        let guard = self.inner.lock().unwrap();
-
-        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
-        // give more permits right now.
-        if guard.init_semaphore.try_acquire().is_ok() {
-            drop(guard);
-            panic!("permit is of wrong origin");
-        }
-
-        Self::set0(value, guard)
-    }
-
-    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
-        if guard.value.is_some() {
-            drop(guard);
-            unreachable!("we won permit, must not be initialized");
-        }
-        guard.value = Some(value);
-        guard.init_semaphore.close();
-        Guard(guard)
-    }
-
    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();
@@ -141,28 +103,6 @@ impl<T> OnceCell<T> {
            None
        }
    }
-
-    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
-    pub fn initializer_count(&self) -> usize {
-        self.initializers.load(Ordering::Relaxed)
-    }
-}
-
-/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
-/// initializing task for example at the end of initialization.
-struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
-
-impl<'a, T> CountWaitingInitializers<'a, T> {
-    fn start(target: &'a OnceCell<T>) -> Self {
-        target.initializers.fetch_add(1, Ordering::Relaxed);
-        CountWaitingInitializers(target)
-    }
-}
-
-impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
-    fn drop(&mut self) {
-        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
-    }
 }

 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
@@ -195,7 +135,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
        let mut swapped = Inner::default();
        let permit = swapped
            .init_semaphore
@@ -205,14 +145,11 @@ impl<'a, T> Guard<'a, T> {
        std::mem::swap(&mut *self.0, &mut swapped);
        swapped
            .value
-            .map(|v| (v, InitPermit(permit)))
+            .map(|v| (v, permit))
            .expect("guard is not created unless value has been initialized")
    }
 }

-/// Type held by OnceCell (de)initializing task.
-pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -248,11 +185,11 @@ mod tests {
                    barrier.wait().await;
                    let won = {
                        let g = cell
-                            .get_or_init(|permit| {
+                            .get_or_init(|| {
                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                async {
                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
-                                    Ok::<_, Infallible>((i, permit))
+                                    Ok::<_, Infallible>(i)
                                }
                            })
                            .await
@@ -306,7 +243,7 @@ mod tests {
        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
            .await
            .unwrap();

@@ -321,32 +258,18 @@ mod tests {
        assert_eq!(*cell.get().unwrap(), reinit);
    }

-    #[test]
-    fn reinit_with_deinit_permit() {
-        let cell = Arc::new(OnceCell::new(42));
-
-        let (mol, permit) = cell.get().unwrap().take_and_deinit();
-        cell.set(5, permit);
-        assert_eq!(*cell.get().unwrap(), 5);
-
-        let (five, permit) = cell.get().unwrap().take_and_deinit();
-        assert_eq!(5, five);
-        cell.set(mol, permit);
-        assert_eq!(*cell.get().unwrap(), 42);
-    }
-
    #[tokio::test]
    async fn initialization_attemptable_until_ok() {
        let cell = OnceCell::default();

        for _ in 0..10 {
-            cell.get_or_init(|_permit| async { Err("whatever error") })
+            cell.get_or_init(|| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");
@@ -358,11 +281,11 @@ mod tests {

        let barrier = tokio::sync::Barrier::new(2);

-        let initializer = cell.get_or_init(|permit| async {
+        let initializer = cell.get_or_init(|| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

-            Ok::<_, Infallible>(("never reached", permit))
+            Ok::<_, Infallible>("never reached")
        });

        tokio::select! {
@@ -375,7 +298,7 @@ mod tests {
        assert!(cell.get().is_none());

        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -1,37 +0,0 @@
-use std::time::Duration;
-
-use tokio_util::sync::CancellationToken;
-
-pub enum TimeoutCancellableError {
-    Timeout,
-    Cancelled,
-}
-
-/// Wrap [`tokio::time::timeout`] with a CancellationToken.
-///
-/// This wrapper is appropriate for any long running operation in a task
-/// that ought to respect a CancellationToken (which means most tasks).
-///
-/// The only time you should use a bare tokio::timeout is when the future `F`
-/// itself respects a CancellationToken: otherwise, always use this wrapper
-/// with your CancellationToken to ensure that your task does not hold up
-/// graceful shutdown.
-pub async fn timeout_cancellable<F>(
-    duration: Duration,
-    cancel: &CancellationToken,
-    future: F,
-) -> Result<F::Output, TimeoutCancellableError>
-where
-    F: std::future::Future,
-{
-    tokio::select!(
-        r = tokio::time::timeout(duration, future) => {
-            r.map_err(|_| TimeoutCancellableError::Timeout)
-
-        },
-        _ = cancel.cancelled() => {
-            Err(TimeoutCancellableError::Cancelled)
-
-        }
-    )
-}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,12 +19,13 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio = { workspace = true, features = ["rt-multi-thread"] }
+tokio.workspace = true
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -21,6 +21,11 @@ pub struct FileCacheState {

 #[derive(Debug)]
 pub struct FileCacheConfig {
+    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
+    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
+    /// memory available for the cgroup.
+    pub(crate) in_memory: bool,
+
    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///
@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl Default for FileCacheConfig {
-    fn default() -> Self {
+impl FileCacheConfig {
+    pub fn default_in_memory() -> Self {
        Self {
+            in_memory: true,
+            // 75 %
+            resource_multiplier: 0.75,
+            // 640 MiB; (512 + 128)
+            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
+            // ensure any increase in file cache size is split 90-10 with 10% to other memory
+            spread_factor: 0.1,
+        }
+    }
+
+    pub fn default_on_disk() -> Self {
+        Self {
+            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
            spread_factor: 0.1,
        }
    }
-}

-impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
@@ -41,6 +39,16 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

+    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
+    /// kernel's page cache), and therefore should not count against available memory.
+    //
+    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
+    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
+    // during the switch away from an in-memory file cache, we had to default to the previous
+    // behavior.
+    #[arg(long)]
+    pub file_cache_on_disk: bool,
+
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -156,7 +156,10 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = FileCacheConfig::default();
+            let config = match args.file_cache_on_disk {
+                true => FileCacheConfig::default_on_disk(),
+                false => FileCacheConfig::default_in_memory(),
+            };

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -184,7 +187,10 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

-            file_cache_disk_size = actual_size;
+            if args.file_cache_on_disk {
+                file_cache_disk_size = actual_size;
+            }
+
            state.filecache = Some(file_cache);
        }

@@ -233,11 +239,17 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let expected_file_cache_size = self
+        let (expected_file_cache_size, expected_file_cache_disk_size) = self
            .filecache
            .as_ref()
-            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
-            .unwrap_or(0);
+            .map(|file_cache| {
+                let size = file_cache.config.calculate_cache_size(usable_system_memory);
+                match file_cache.config.in_memory {
+                    true => (size, 0),
+                    false => (size, size),
+                }
+            })
+            .unwrap_or((0, 0));
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

@@ -261,7 +273,7 @@ impl Runner {

            let new_threshold = self
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_size);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);

            let current = last_history.avg_non_reclaimable;

@@ -288,10 +300,13 @@ impl Runner {
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            file_cache_disk_size = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
+            }
            let message = format!(
-                "set file cache size to {} MiB",
+                "set file cache size to {} MiB (in memory = {})",
                bytes_to_mebibytes(actual_usage),
+                file_cache.config.in_memory,
            );
            info!("downscale: {message}");
            status.push(message);
@@ -342,7 +357,9 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            file_cache_disk_size = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
+            }

            if actual_usage != expected_usage {
                warn!(
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -188,7 +188,6 @@ extern "C" fn recovery_download(
    }
 }

-#[allow(clippy::unnecessary_cast)]
 extern "C" fn wal_read(
    sk: *mut Safekeeper,
    buf: *mut ::std::os::raw::c_char,
@@ -422,7 +421,6 @@ impl std::fmt::Display for Level {
 }

 /// Take ownership of `Vec<u8>` from StringInfoData.
-#[allow(clippy::unnecessary_cast)]
 pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
    if pg.data.is_null() {
        return None;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -186,7 +186,7 @@ impl Wrapper {
            .unwrap()
            .into_bytes_with_nul();
        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -88,6 +88,10 @@ criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

+[[bench]]
+name = "bench_writes"
+harness = false
+
 [[bench]]
 name = "bench_layer_map"
 harness = false
--- a/pageserver/benches/README.md
+++ b/pageserver/benches/README.md
@@ -10,3 +10,7 @@ To run a specific file:

 To run a specific function:
 `cargo bench --bench bench_layer_map -- real_map_uniform_queries`
+
+To add a new benchmark:
+1. Create a new file in this directory that invokes `criterion_main!`.
+2. Register it in `pageserver/Cargo.toml` as a `[[bench]]` target with `harness = false`.
--- a/pageserver/benches/bench_writes.rs
+++ b/pageserver/benches/bench_writes.rs
@@ -0,0 +1,76 @@
+use bytes::{Bytes, BytesMut};
+use camino::{Utf8Path, Utf8PathBuf};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use pageserver::{tenant::storage_layer::InMemoryLayer, config::PageServerConf, context::{RequestContext, DownloadBehavior}, task_mgr::TaskKind, repository::Key, virtual_file};
+use pageserver::repository::Value;
+use utils::{id::{TimelineId, TenantId}, lsn::Lsn};
+
+/// Criterion entry point for pageserver write benchmarks (work in progress).
+/// Fills an `InMemoryLayer` with 100 64-byte page images and calls `write_to_disk_bench` on it
+/// once during setup; the `f1`/`f2` benchmark bodies are still empty placeholders.
+fn bench_writes(c: &mut Criterion) {
+    // Boilerplate.
+    // TODO: this setup could be avoided by reusing TenantHarness, but that is only compiled for
+    //       tests. A function that just writes bytes from memory to disk should not need this
+    //       many inputs; performance-critical code should be self-contained, machinery outside.
+    virtual_file::init(10);
+    // Use the system temp dir rather than a developer-specific path so this runs on any machine.
+    let repo_dir = Utf8PathBuf::from_path_buf(std::env::temp_dir().join("bench_writes_repo_dir"))
+        .expect("temp dir path must be valid UTF-8");
+    let conf = PageServerConf::dummy_conf(repo_dir);
+    let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+    let timeline_id = TimelineId::generate();
+    let tenant_id = TenantId::generate();
+    let start_lsn = Lsn(0);
+    let ctx = RequestContext::new(TaskKind::LayerFlushTask, DownloadBehavior::Error);
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    // Build a 64-byte page image: the bytes of `s`, zero-padded (or truncated) to 64 bytes.
+    fn test_img(s: &str) -> Bytes {
+        let mut buf = BytesMut::new();
+        buf.extend_from_slice(s.as_bytes());
+        buf.resize(64, 0);
+        buf.freeze()
+    }
+
+    // Make the InMemoryLayer that will be flushed
+    let layer = rt.block_on(async {
+        let l = InMemoryLayer::create(&conf, timeline_id, tenant_id, start_lsn).await.unwrap();
+        let mut lsn = Lsn(0x10);
+        let mut key = Key::from_hex("012222222233333333444444445500000000").unwrap();
+        let mut blknum = 0;
+        for _ in 0..100 {
+            key.field6 = blknum;
+            let val = Value::Image(test_img(&format!("{} at {}", blknum, lsn)));
+            l.put_value(key, lsn, &val, &ctx).await.unwrap();
+            lsn = Lsn(lsn.0 + 0x10);
+            blknum += 1;
+        }
+        l
+    });
+
+    // NOTE: this call runs once during setup; it is not inside a measured `b.iter` closure yet.
+    rt.block_on(async {
+        layer.write_to_disk_bench(&ctx).await.unwrap();
+    });
+
+    let mut group = c.benchmark_group("g1");
+    group.bench_function("f1", |b| {
+        b.iter(|| {
+            // TODO
+        });
+    });
+    group.bench_function("f2", |b| {
+        b.iter(|| {
+            // TODO
+        });
+    });
+    group.finish();
+}
+
+
+criterion_group!(group_1, bench_writes);
+criterion_main!(group_1);
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -1,21 +1,22 @@
-use utils::auth::{AuthError, Claims, Scope};
+use anyhow::{bail, Result};
+use utils::auth::{Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => Err(AuthError(
-            "Attempt to access management api with tenant scope. Permission denied".into(),
-        )),
+        (Scope::Tenant, None) => {
+            bail!("Attempt to access management api with tenant scope. Permission denied")
+        }
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
+                bail!("Tenant id mismatch. Permission denied")
            }
            Ok(())
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => Err(AuthError(
-            "SafekeeperData scope makes no sense for Pageserver".into(),
-        )),
+        (Scope::SafekeeperData, _) => {
+            bail!("SafekeeperData scope makes no sense for Pageserver")
+        }
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,11 +34,8 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::{JwtAuth, SwappableJwtAuth},
-    logging, project_build_tag, project_git_version,
-    sentry_init::init_sentry,
-    signals::Signal,
-    tcp_listener,
+    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
+    signals::Signal, tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -324,12 +321,13 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and exists
+        // unwrap is ok because check is performed when creating config, so path is set and file exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
-
-        let jwt_auth = JwtAuth::from_key_path(key_path)?;
-        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));
+        info!(
+            "Loading public key for verifying JWT tokens from {:#?}",
+            key_path
+        );
+        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -412,7 +410,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -422,7 +420,6 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
-    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -551,7 +548,6 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
-                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
+    /// Path to a file containing public key for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

@@ -1314,6 +1314,12 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
+                    max_concurrent_syncs: NonZeroUsize::new(
+                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
+                    )
+                        .unwrap(),
+                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
+                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1374,6 +1380,8 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
+                    max_concurrent_syncs,
+                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -3,6 +3,7 @@ use anyhow::Context;
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
+use serde_with::serde_as;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -41,10 +42,13 @@ pub(super) enum Name {
 ///
 /// This is a denormalization done at the MetricsKey const methods; these should not be constructed
 /// elsewhere.
+#[serde_with::serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub(crate) struct MetricsKey {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,

+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,

@@ -202,6 +206,7 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
+                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,4 +1,5 @@
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use serde_with::serde_as;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -6,9 +7,12 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
 use utils::id::{TenantId, TimelineId};

 /// How the metrics from pageserver are identified.
+#[serde_with::serde_as]
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
 struct Ids {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
-use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -18,6 +17,7 @@ use hex::FromHex;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
+use serde_with::serde_as;
 use thiserror::Error;
 use tokio;
 use tokio_util::sync::CancellationToken;
@@ -214,6 +214,7 @@ where
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";

+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionList {
    /// Serialization version, for future use
@@ -242,6 +243,7 @@ struct DeletionList {
    validated: bool,
 }

+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionHeader {
    /// Serialization version, for future use
@@ -269,9 +271,7 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
-            .maybe_fatal_err("save deletion header")?;
-
-        Ok(())
+            .map_err(Into::into)
    }
 }

@@ -360,7 +360,6 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
-            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
 }
@@ -893,6 +892,14 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
+            max_concurrent_syncs: std::num::NonZeroUsize::new(
+                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+            )
+            .unwrap(),
+            max_sync_errors: std::num::NonZeroU32::new(
+                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+            )
+            .unwrap(),
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,24 +55,21 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+        fail::fail_point!("deletion-queue-before-execute", |_| {
+            info!("Skipping execution, failpoint set");
+            metrics::DELETION_QUEUE
+                .remote_errors
+                .with_label_values(&["failpoint"])
+                .inc();
+            Err(anyhow::anyhow!("failpoint hit"))
+        });
+
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async {
-                fail::fail_point!("deletion-queue-before-execute", |_| {
-                    info!("Skipping execution, failpoint set");
-
-                    metrics::DELETION_QUEUE
-                        .remote_errors
-                        .with_label_values(&["failpoint"])
-                        .inc();
-                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
-                });
-
-                self.remote_storage.delete_objects(&self.accumulator).await
-            },
+            || async { self.remote_storage.delete_objects(&self.accumulator).await },
            |_| false,
            3,
            10,
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::virtual_file::on_fatal_io_error;
-use crate::virtual_file::MaybeFatalIo;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    on_fatal_io_error(&e, "reading deletion header");
+                    Err(anyhow::anyhow!(e))
                }
            }
        }
@@ -218,9 +216,16 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = tokio::fs::read_dir(&deletion_directory)
-            .await
-            .fatal_err("read deletion directory");
+        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+            Ok(d) => d,
+            Err(e) => {
+                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
+
+                // Give up: if we can't read the deletion list directory, we probably can't
+                // write lists into it later, so the queue won't work.
+                return Err(e.into());
+            }
+        };

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
+        while let Some(dentry) = dir.next_entry().await? {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -241,9 +246,11 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
-                tokio::fs::remove_file(&absolute_path)
-                    .await
-                    .fatal_err("delete temp file");
+                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+                    // Non-fatal error: we will just leave the file behind but not
+                    // try and load it.
+                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
+                }

                continue;
            }
@@ -283,9 +290,7 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path)
-                .await
-                .fatal_err("read deletion list");
+            let list_bytes = tokio::fs::read(&list_path).await?;

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
-use crate::virtual_file::MaybeFatalIo;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
-            tokio::fs::remove_file(&list_path)
-                .await
-                .fatal_err("remove deletion list");
+
+            if let Err(e) = tokio::fs::remove_file(&list_path).await {
+                // Unexpected: we should have permissions, and nothing else should be
+                // touching these files.  We leave the file behind and stop cleaning up
+                // the remaining lists.  Subsequent pageservers will try to load it again:
+                // hopefully the storage issue (probably permissions) is fixed by then.
+                tracing::error!("Failed to delete {list_path}: {e:#}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                break;
+            }
        }
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -403,7 +403,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    return (evicted_bytes, evictions_failed);
                };

-                let results = timeline.evict_layers(&batch).await;
+                let results = timeline.evict_layers(&batch, &cancel).await;

                match results {
                    Ok(results) => {
@@ -545,7 +545,7 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
@@ -554,11 +554,6 @@ async fn collect_eviction_candidates(
            }
        };

-        if tenant.cancel.is_cancelled() {
-            info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
-            continue;
-        }
-
        // collect layers from all timelines in this tenant
        //
        // If one of the timelines becomes `!is_active()` during the iteration,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,31 +52,6 @@ paths:
              schema:
                type: object

-  /v1/reload_auth_validation_keys:
-    post:
-      description: Reloads the JWT public keys from their pre-configured location on disk.
-      responses:
-        "200":
-          description: The reload completed successfully.
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error (also hits if no keys were found)
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -352,8 +327,7 @@ paths:
          in: query
          required: true
          schema:
-            type: string
-            format: hex
+            type: integer
          description: A LSN to get the timestamp
      responses:
        "200":
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,10 +17,10 @@ use pageserver_api::models::{
    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
+use serde_with::{serde_as, DisplayFromStr};
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::auth::JwtAuth;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -36,8 +36,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
-    TenantSlotError, TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -46,7 +45,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
-    auth::SwappableJwtAuth,
+    auth::JwtAuth,
    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -64,8 +63,7 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
@@ -76,8 +74,7 @@ pub struct State {
 impl State {
    pub fn new(
        conf: &'static PageServerConf,
-        tenant_manager: Arc<TenantManager>,
-        auth: Option<Arc<SwappableJwtAuth>>,
+        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -89,7 +86,6 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
-            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
@@ -151,59 +147,28 @@ impl From<PageReconstructError> for ApiError {
 impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
-            TenantMapInsertError::SlotError(e) => e.into(),
-            TenantMapInsertError::SlotUpsertError(e) => e.into(),
+            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
+                ApiError::ResourceUnavailable(format!("{tmie}").into())
+            }
+            TenantMapInsertError::TenantAlreadyExists(id, state) => {
+                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+            }
+            TenantMapInsertError::TenantExistsSecondary(id) => {
+                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
+            }
            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }

-impl From<TenantSlotError> for ApiError {
-    fn from(e: TenantSlotError) -> ApiError {
-        use TenantSlotError::*;
-        match e {
-            NotFound(tenant_id) => {
-                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
-            }
-            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
-            InProgress => {
-                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
-            }
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantSlotUpsertError> for ApiError {
-    fn from(e: TenantSlotUpsertError) -> ApiError {
-        use TenantSlotUpsertError::*;
-        match e {
-            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantMapError> for ApiError {
-    fn from(e: TenantMapError) -> ApiError {
-        use TenantMapError::*;
-        match e {
-            StillInitializing | ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{e}").into())
-            }
-        }
-    }
-}
-
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
-            TenantStateError::SlotError(e) => e.into(),
-            TenantStateError::SlotUpsertError(e) => e.into(),
-            TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
+            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
 }
@@ -224,7 +189,6 @@ impl From<GetTenantError> for ApiError {
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
-            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
        }
    }
 }
@@ -279,9 +243,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            Get(g) => ApiError::from(g),
            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
-            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
-            SlotError(e) => e.into(),
-            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
        }
@@ -303,7 +264,11 @@ async fn build_timeline_info(
        // we're executing this function, we will outlive the timeline on-disk state.
        info.current_logical_size_non_incremental = Some(
            timeline
-                .get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
+                .get_current_logical_size_non_incremental(
+                    info.last_record_lsn,
+                    CancellationToken::new(),
+                    ctx,
+                )
                .await?,
        );
    }
@@ -389,32 +354,6 @@ async fn status_handler(
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

-async fn reload_auth_validation_keys_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-    let config = get_config(&request);
-    let state = get_state(&request);
-    let Some(shared_auth) = &state.auth else {
-        return json_response(StatusCode::BAD_REQUEST, ());
-    };
-    // unwrap is ok because check is performed when creating config, so path is set and exists
-    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
-    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
-
-    match JwtAuth::from_key_path(key_path) {
-        Ok(new_auth) => {
-            shared_auth.swap(new_auth);
-            json_response(StatusCode::OK, ())
-        }
-        Err(e) => {
-            warn!("Error reloading public keys from {key_path:?}: {e:}");
-            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
-        }
-    }
-}
-
 async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -430,7 +369,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -458,9 +397,6 @@ async fn timeline_create_handler(
            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
            }
-            Err(tenant::CreateTimelineError::ShuttingDown) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
-            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -480,7 +416,7 @@ async fn timeline_list_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -519,7 +455,7 @@ async fn timeline_detail_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -563,8 +499,10 @@ async fn get_lsn_by_timestamp_handler(
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

    if version.unwrap_or(0) > 1 {
+        #[serde_as]
        #[derive(serde::Serialize)]
        struct Result {
+            #[serde_as(as = "DisplayFromStr")]
            lsn: Lsn,
            kind: &'static str,
        }
@@ -775,7 +713,7 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false)?;
+        let tenant = mgr::get_tenant(tenant_id, false).await?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -838,7 +776,7 @@ async fn tenant_size_handler(
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;

    // this can be long operation
    let inputs = tenant
@@ -873,8 +811,10 @@ async fn tenant_size_handler(
    }

    /// The type resides in the pageserver not to expose `ModelInputs`.
+    #[serde_with::serde_as]
    #[derive(serde::Serialize)]
    struct TenantHistorySize {
+        #[serde_as(as = "serde_with::DisplayFromStr")]
        id: TenantId,
        /// Size is a mixture of WAL and logical size, so the unit is bytes.
        ///
@@ -1095,7 +1035,7 @@ async fn get_tenant_config_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false)?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;

    let response = HashMap::from([
        (
@@ -1154,7 +1094,7 @@ async fn put_tenant_location_config_handler(
            .await
        {
            match e {
-                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
+                TenantStateError::NotFound(_) => {
                    // This API is idempotent: a NotFound on a detach is fine.
                }
                _ => return Err(e.into()),
@@ -1166,14 +1106,20 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    state
-        .tenant_manager
-        .upsert_location(tenant_id, location_conf, &ctx)
-        .await
-        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-        // principle we might have hit something like concurrent API calls to the same tenant,
-        // which is not a 400 but a 409.
-        .map_err(ApiError::BadRequest)?;
+    mgr::upsert_location(
+        state.conf,
+        tenant_id,
+        location_conf,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .await
+    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+    // principle we might have hit something like concurrent API calls to the same tenant,
+    // which is not a 400 but a 409.
+    .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1186,6 +1132,7 @@ async fn handle_tenant_break(
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
+        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1490,7 +1437,7 @@ async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
@@ -1667,8 +1614,6 @@ where
        );

        match handle.await {
-            // TODO: never actually return Err from here, always Ok(...) so that we can log
-            // spanned errors. Call api_error_handler instead and return appropriate Body.
            Ok(result) => result,
            Err(e) => {
                // The handler task panicked. We have a global panic handler that logs the
@@ -1717,7 +1662,7 @@ where
 pub fn make_router(
    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1746,9 +1691,6 @@ pub fn make_router(
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
-        .post("/v1/reload_auth_validation_keys", |r| {
-            api_handler(r, reload_auth_validation_keys_handler)
-        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
-
 mod auth;
 pub mod basebackup;
 pub mod config;
@@ -63,6 +61,14 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    )
    .await;

+    // Shut down any page service tasks.
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        "shutdown PageRequestHandlers",
+        Duration::from_secs(1),
+    )
+    .await;
+
    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
    timed(
@@ -72,15 +78,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    )
    .await;

-    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
-    // should already have been canclled via mgr::shutdown_all_tenants
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
-        "shutdown PageRequestHandlers",
-        Duration::from_secs(1),
-    )
-    .await;
-
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    if let Some(mut deletion_queue) = deletion_queue {
        deletion_queue.shutdown(Duration::from_secs(5)).await;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -962,32 +962,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
    .expect("failed to define a metric")
 });

-pub(crate) struct TenantManagerMetrics {
-    pub(crate) tenant_slots: UIntGauge,
-    pub(crate) tenant_slot_writes: IntCounter,
-    pub(crate) unexpected_errors: IntCounter,
-}
-
-pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
-    TenantManagerMetrics {
-    tenant_slots: register_uint_gauge!(
-        "pageserver_tenant_manager_slots",
-        "How many slots currently exist, including all attached, secondary and in-progress operations",
-    )
-    .expect("failed to define a metric"),
-    tenant_slot_writes: register_int_counter!(
-        "pageserver_tenant_manager_slot_writes",
-        "Writes to a tenant slot, including all of create/attach/detach/delete"
-    )
-    .expect("failed to define a metric"),
-    unexpected_errors: register_int_counter!(
-        "pageserver_tenant_manager_unexpected_errors_total",
-        "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
-    )
-    .expect("failed to define a metric"),
-}
-});
-
 pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
@@ -1225,6 +1199,15 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_wal_redo_wait_seconds",
+        "Time spent waiting for access to the Postgres WAL redo process",
+        redo_histogram_time_buckets!(),
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
@@ -1901,9 +1884,6 @@ pub fn preinitialize_metrics() {
    // Deletion queue stats
    Lazy::force(&DELETION_QUEUE);

-    // Tenant manager stats
-    Lazy::force(&TENANT_MANAGER);
-
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
@@ -1919,6 +1899,7 @@ pub fn preinitialize_metrics() {
        &READ_NUM_FS_LAYERS,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
+        &WAL_REDO_WAIT_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
        &WAL_REDO_BYTES_HISTOGRAM,
    ]
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -40,7 +40,7 @@ use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
-    auth::{Claims, Scope, SwappableJwtAuth},
+    auth::{Claims, JwtAuth, Scope},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    simple_rcu::RcuReadGuard,
@@ -55,20 +55,16 @@ use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
-use crate::tenant::mgr::get_active_tenant_with_timeout;
-use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::Timeline;
+use crate::tenant::mgr::GetTenantError;
+use crate::tenant::{Tenant, Timeline};
 use crate::trace::Tracer;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
-// is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
-
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -122,7 +118,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
@@ -190,7 +186,7 @@ pub async fn libpq_listener_main(
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
@@ -218,34 +214,22 @@ async fn page_service_conn_main(
    // no write timeout is used, because the kernel is assumed to error writes after some time.
    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);

-    let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
-    let socket_timeout_ms = (|| {
-        fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
-            // Exponential distribution for simulating
-            // poor network conditions, expect about avg_timeout_ms to be around 15
-            // in tests
-            if let Some(avg_timeout_ms) = avg_timeout_ms {
-                let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
-                let u = rand::random::<f32>();
-                ((1.0 - u).ln() / (-avg)) as u64
-            } else {
-                default_timeout_ms
-            }
-        });
-        default_timeout_ms
-    })();
-
-    // A timeout here does not mean the client died, it can happen if it's just idle for
-    // a while: we will tear down this PageServerHandler and instantiate a new one if/when
-    // they reconnect.
-    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
+    // The timeout should be lower, but we are trying out multiple days (3) for
+    // <https://github.com/neondatabase/neon/issues/4205>
+    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
    let socket = std::pin::pin!(socket);

    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(
+        conf,
+        broker_client,
+        auth,
+        connection_ctx,
+        task_mgr::shutdown_token(),
+    );
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -271,7 +255,7 @@ async fn page_service_conn_main(
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    claims: Option<Claims>,

    /// The context created for the lifetime of the connection
@@ -279,14 +263,19 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
+
+    /// A token that should fire when the tenant transitions out of the
+    /// attached state, or when the pageserver is shutting down.
+    cancel: CancellationToken,
 }

 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
-        auth: Option<Arc<SwappableJwtAuth>>,
+        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
+        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -294,6 +283,7 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            cancel,
        }
    }

@@ -301,11 +291,7 @@ impl PageServerHandler {
    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
    /// in the flush.
-    async fn flush_cancellable<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError>
+    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
@@ -313,7 +299,7 @@ impl PageServerHandler {
            flush_r = pgb.flush() => {
                Ok(flush_r?)
            },
-            _ = cancel.cancelled() => {
+            _ = self.cancel.cancelled() => {
                Err(QueryError::Shutdown)
            }
        )
@@ -322,7 +308,6 @@ impl PageServerHandler {
    fn copyin_stream<'a, IO>(
        &'a self,
        pgb: &'a mut PostgresBackend<IO>,
-        cancel: &'a CancellationToken,
    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -332,7 +317,7 @@ impl PageServerHandler {
                let msg = tokio::select! {
                    biased;

-                    _ = cancel.cancelled() => {
+                    _ = self.cancel.cancelled() => {
                        // We were requested to shut down.
                        let msg = "pageserver is shutting down";
                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
@@ -372,7 +357,7 @@ impl PageServerHandler {
                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
                        // error can't happen here, ErrorResponse serialization should be always ok
                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                    }
                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
@@ -399,13 +384,12 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        // NOTE: the page request handler exits when the connection is closed,
+        //       so there is no need to reset the association
+        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
+
        // Make request tracer if needed
-        let tenant = mgr::get_active_tenant_with_timeout(
-            tenant_id,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path = tenant
@@ -421,14 +405,9 @@ impl PageServerHandler {
            .get_timeline(timeline_id, true)
            .map_err(|e| anyhow::anyhow!(e))?;

-        // Avoid starting new requests if the timeline has already started shutting down,
-        // and block timeline shutdown until this request is complete, or drops out due
-        // to cancellation.
-        let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        self.flush_cancellable(pgb).await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -436,7 +415,7 @@ impl PageServerHandler {
            let msg = tokio::select! {
                biased;

-                _ = timeline.cancel.cancelled() => {
+                _ = self.cancel.cancelled() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
                    return Err(QueryError::Shutdown)
@@ -511,24 +490,9 @@ impl PageServerHandler {
                }
            };

-            if let Err(e) = &response {
-                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                // because wait_lsn etc will drop out
-                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                // is_canceled(): [`Timeline::shutdown`]` has entered
-                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
-                    // If we fail to fulfil a request during shutdown, which may be _because_ of
-                    // shutdown, then do not send the error to the client.  Instead just drop the
-                    // connection.
-                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
-                    return Err(QueryError::Shutdown);
-                }
-            }
-
            let response = response.unwrap_or_else(|e| {
                // print the all details to the log with {:#}, but for the client the
-                // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                // here includes cancellation which is not an error.
+                // error message is enough
                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
                PagestreamBeMessage::Error(PagestreamErrorResponse {
                    message: e.to_string(),
@@ -536,7 +500,7 @@ impl PageServerHandler {
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb, &timeline.cancel).await?;
+            self.flush_cancellable(pgb).await?;
        }
        Ok(())
    }
@@ -558,14 +522,10 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .await?;
@@ -583,9 +543,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &tenant.cancel).await?;
+        self.flush_cancellable(pgb).await?;

-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -622,10 +582,9 @@ impl PageServerHandler {
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        debug_assert_current_span_has_tenant_and_timeline_id();
+        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

-        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
-            .await?;
+        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(QueryError::Other(
@@ -639,8 +598,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
+        self.flush_cancellable(pgb).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -833,9 +792,7 @@ impl PageServerHandler {
        let started = std::time::Instant::now();

        // check that the timeline exists
-        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
-            .await?;
+        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -850,7 +807,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        self.flush_cancellable(pgb).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -902,7 +859,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        self.flush_cancellable(pgb).await?;

        let basebackup_after = started
            .elapsed()
@@ -920,7 +877,7 @@ impl PageServerHandler {

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
@@ -932,26 +889,7 @@ impl PageServerHandler {
            .claims
            .as_ref()
            .expect("claims presence already checked");
-        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
-    }
-
-    /// Shorthand for getting a reference to a Timeline of an Active tenant.
-    async fn get_active_tenant_timeline(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
-        Ok(timeline)
+        check_permission(claims, tenant_id)
    }
 }

@@ -971,17 +909,16 @@ where
            .auth
            .as_ref()
            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
-            .map_err(|e| QueryError::Unauthorized(e.0))?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;

        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
-            return Err(QueryError::Unauthorized(
-                "jwt token scope is Tenant, but tenant id is missing".into(),
-            ));
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "jwt token scope is Tenant, but tenant id is missing"
+            )));
        }

-        debug!(
-            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
+        info!(
+            "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

@@ -1003,13 +940,9 @@ where
        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
-        fail::fail_point!("simulated-bad-compute-connection", |_| {
-            info!("Hit failpoint for bad connection");
-            Err(QueryError::SimulatedConnectionError)
-        });
-
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
+
        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1115,9 +1048,7 @@ where
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
-            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id)
-                .await?;
+            let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;

            let end_of_timeline = timeline.get_last_record_rlsn();

@@ -1301,12 +1232,7 @@ where

            self.check_permission(Some(tenant_id))?;

-            let tenant = get_active_tenant_with_timeout(
-                tenant_id,
-                ACTIVE_TENANT_TIMEOUT,
-                &task_mgr::shutdown_token(),
-            )
-            .await?;
+            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1352,16 +1278,67 @@ where
    }
 }

+#[derive(thiserror::Error, Debug)]
+enum GetActiveTenantError {
+    #[error(
+        "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
+    )]
+    WaitForActiveTimeout {
+        latest_state: TenantState,
+        wait_time: Duration,
+    },
+    #[error(transparent)]
+    NotFound(GetTenantError),
+    #[error(transparent)]
+    WaitTenantActive(tenant::WaitToBecomeActiveError),
+}
+
 impl From<GetActiveTenantError> for QueryError {
    fn from(e: GetActiveTenantError) -> Self {
        match e {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
-                QueryError::Shutdown
+            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
+            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
+        }
+    }
+}
+
+/// Get active tenant.
+///
+/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
+/// ensures that queries don't fail immediately after pageserver startup, because
+/// all tenants are still loading.
+async fn get_active_tenant_with_timeout(
+    tenant_id: TenantId,
+    _ctx: &RequestContext, /* unused; taken so cancellation can be supported in the future */
+) -> Result<Arc<Tenant>, GetActiveTenantError> {
+    let tenant = match mgr::get_tenant(tenant_id, false).await {
+        Ok(tenant) => tenant,
+        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
+        Err(GetTenantError::NotActive(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+        Err(GetTenantError::Broken(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+    };
+    let wait_time = Duration::from_secs(30);
+    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
+        Ok(Ok(())) => Ok(tenant),
+        // no .context(), the error message is good enough and some tests depend on it
+        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
+        Err(_) => {
+            let latest_state = tenant.current_state();
+            if latest_state == TenantState::Active {
+                Ok(tenant)
+            } else {
+                Err(GetActiveTenantError::WaitForActiveTimeout {
+                    latest_state,
+                    wait_time,
+                })
            }
-            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
 }
@@ -1382,3 +1359,18 @@ impl From<GetActiveTimelineError> for QueryError {
        }
    }
 }
+
+/// Shorthand for getting a reference to a Timeline of an Active tenant.
+async fn get_active_tenant_timeline(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ctx: &RequestContext,
+) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
+    Ok(timeline)
+}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -43,17 +44,6 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

-impl From<PageReconstructError> for CalculateLogicalSizeError {
-    fn from(pre: PageReconstructError) -> Self {
-        match pre {
-            PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
-                Self::Cancelled
-            }
-            _ => Self::Other(pre.into()),
-        }
-    }
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
    #[error("Relation Already Exists")]
@@ -577,22 +567,30 @@ impl Timeline {
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

        // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
+        let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
-                if self.cancel.is_cancelled() {
+            for rel in self
+                .list_rels(*spcnode, *dbnode, lsn, ctx)
+                .await
+                .context("list rels")?
+            {
+                if cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
                let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn, ctx).await?;
+                let mut buf = self
+                    .get(relsize_key, lsn, ctx)
+                    .await
+                    .with_context(|| format!("read relation size of {rel:?}"))?;
                let relsize = buf.get_u32_le();

                total_size += relsize as u64;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -299,6 +299,10 @@ pub enum TaskKind {

 #[derive(Default)]
 struct MutableTaskState {
+    /// Tenant and timeline that this task is associated with.
+    tenant_id: Option<TenantId>,
+    timeline_id: Option<TimelineId>,
+
    /// Handle for waiting for the task to exit. It can be None, if the
    /// the task has already exited.
    join_handle: Option<JoinHandle<()>>,
@@ -315,11 +319,6 @@ struct PageServerTask {
    // To request task shutdown, just cancel this token.
    cancel: CancellationToken,

-    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
-    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_id: Option<TenantId>,
-    timeline_id: Option<TimelineId>,
-
    mutable: Mutex<MutableTaskState>,
 }

@@ -345,9 +344,11 @@ where
        kind,
        name: name.to_string(),
        cancel: cancel.clone(),
-        tenant_id,
-        timeline_id,
-        mutable: Mutex::new(MutableTaskState { join_handle: None }),
+        mutable: Mutex::new(MutableTaskState {
+            tenant_id,
+            timeline_id,
+            join_handle: None,
+        }),
    });

    TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
@@ -417,6 +418,8 @@ async fn task_finish(

    let mut shutdown_process = false;
    {
+        let task_mut = task.mutable.lock().unwrap();
+
        match result {
            Ok(Ok(())) => {
                debug!("Task '{}' exited normally", task_name);
@@ -425,13 +428,13 @@ async fn task_finish(
                if shutdown_process_on_error {
                    error!(
                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                }
            }
@@ -439,13 +442,13 @@ async fn task_finish(
                if shutdown_process_on_error {
                    error!(
                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                }
            }
@@ -457,6 +460,17 @@ async fn task_finish(
    }
 }

+/// Associate the calling task with a tenant/timeline. Must be called from within the task itself.
+pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
+    CURRENT_TASK.with(|ct| {
+        let mut task_mut = ct.mutable.lock().unwrap();
+        task_mut.tenant_id = tenant_id;
+        task_mut.timeline_id = timeline_id;
+    });
+}
+
+// FIXME: orphaned doc line (no item follows): "Is there a task running that matches the criteria"
+
 /// Signal and wait for tasks to shut down.
 ///
 ///
@@ -479,16 +493,17 @@ pub async fn shutdown_tasks(
    {
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
+            let task_mut = task.mutable.lock().unwrap();
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_id.is_none() || task.tenant_id == tenant_id)
-                && (timeline_id.is_none() || task.timeline_id == timeline_id)
+                && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
+                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
            {
                task.cancel.cancel();
                victim_tasks.push((
                    Arc::clone(task),
                    task.kind,
-                    task.tenant_id,
-                    task.timeline_id,
+                    task_mut.tenant_id,
+                    task_mut.timeline_id,
                ));
            }
        }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -26,7 +26,6 @@ use tracing::*;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext;
-use utils::sync::gate::Gate;

 use std::cmp::min;
 use std::collections::hash_map::Entry;
@@ -55,8 +54,6 @@ use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
-use self::mgr::GetActiveTenantError;
-use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
@@ -255,20 +252,6 @@ pub struct Tenant {
    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
-
-    // Cancellation token fires when we have entered shutdown().  This is a parent of
-    // Timelines' cancellation token.
-    pub(crate) cancel: CancellationToken,
-
-    // Users of the Tenant such as the page service must take this Gate to avoid
-    // trying to use a Tenant which is shutting down.
-    pub(crate) gate: Gate,
-}
-
-impl std::fmt::Debug for Tenant {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{} ({})", self.tenant_id, self.current_state())
-    }
 }

 pub(crate) enum WalRedoManager {
@@ -376,6 +359,34 @@ impl Debug for SetStoppingError {
    }
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum WaitToBecomeActiveError {
+    WillNotBecomeActive {
+        tenant_id: TenantId,
+        state: TenantState,
+    },
+    TenantDropped {
+        tenant_id: TenantId,
+    },
+}
+
+impl std::fmt::Display for WaitToBecomeActiveError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
+                write!(
+                    f,
+                    "Tenant {} will not become active. Current state: {:?}",
+                    tenant_id, state
+                )
+            }
+            WaitToBecomeActiveError::TenantDropped { tenant_id } => {
+                write!(f, "Tenant {tenant_id} will not become active (dropped)")
+            }
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
    #[error("a timeline with the given ID already exists")]
@@ -384,8 +395,6 @@ pub enum CreateTimelineError {
    AncestorLsn(anyhow::Error),
    #[error("ancestor timeline is not active")]
    AncestorNotActive,
-    #[error("tenant shutting down")]
-    ShuttingDown,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -517,7 +526,7 @@ impl Tenant {
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
        init_order: Option<InitializationOrder>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -1515,11 +1524,6 @@ impl Tenant {
            )));
        }

-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| CreateTimelineError::ShuttingDown)?;
-
        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
            debug!("timeline {new_timeline_id} already exists");

@@ -1804,7 +1808,6 @@ impl Tenant {
        freeze_and_flush: bool,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();
-
        // Set tenant (and its timlines) to Stoppping state.
        //
        // Since we can only transition into Stopping state after activation is complete,
@@ -1830,7 +1833,6 @@ impl Tenant {
            }
            Err(SetStoppingError::AlreadyStopping(other)) => {
                // give caller the option to wait for this this shutdown
-                info!("Tenant::shutdown: AlreadyStopping");
                return Err(other);
            }
        };
@@ -1841,16 +1843,9 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let span = Span::current();
-                js.spawn(async move {
-                    if freeze_and_flush {
-                        timeline.flush_and_shutdown().instrument(span).await
-                    } else {
-                        timeline.shutdown().instrument(span).await
-                    }
-                });
+                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
            })
        };
-        tracing::info!("Waiting for timelines...");
        while let Some(res) = js.join_next().await {
            match res {
                Ok(()) => {}
@@ -1860,21 +1855,12 @@ impl Tenant {
            }
        }

-        // We cancel the Tenant's cancellation token _after_ the timelines have all shut down.  This permits
-        // them to continue to do work during their shutdown methods, e.g. flushing data.
-        tracing::debug!("Cancelling CancellationToken");
-        self.cancel.cancel();
-
        // shutdown all tenant and timeline tasks: gc, compaction, page service
        // No new tasks will be started for this tenant because it's in `Stopping` state.
        //
        // this will additionally shutdown and await all timeline tasks.
-        tracing::debug!("Waiting for tasks...");
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;

-        // Wait for any in-flight operations to complete
-        self.gate.close().await;
-
        Ok(())
    }

@@ -2035,7 +2021,7 @@ impl Tenant {
        self.state.subscribe()
    }

-    pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
+    pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
        let mut receiver = self.state.subscribe();
        loop {
            let current_state = receiver.borrow_and_update().clone();
@@ -2043,9 +2029,11 @@ impl Tenant {
                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                    // in these states, there's a chance that we can reach ::Active
                    receiver.changed().await.map_err(
-                        |_e: tokio::sync::watch::error::RecvError|
-                            // Tenant existed but was dropped: report it as non-existent
-                            GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id))
+                        |_e: tokio::sync::watch::error::RecvError| {
+                            WaitToBecomeActiveError::TenantDropped {
+                                tenant_id: self.tenant_id,
+                            }
+                        },
                    )?;
                }
                TenantState::Active { .. } => {
@@ -2053,7 +2041,10 @@ impl Tenant {
                }
                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                    // There's no chance the tenant can transition back into ::Active
-                    return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
+                    return Err(WaitToBecomeActiveError::WillNotBecomeActive {
+                        tenant_id: self.tenant_id,
+                        state: current_state,
+                    });
                }
            }
        }
@@ -2119,9 +2110,6 @@ where
 }

 impl Tenant {
-    pub fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
        self.tenant_conf.read().unwrap().tenant_conf
    }
@@ -2279,7 +2267,6 @@ impl Tenant {
            initial_logical_size_can_start.cloned(),
            initial_logical_size_attempt.cloned().flatten(),
            state,
-            self.cancel.child_token(),
        );

        Ok(timeline)
@@ -2369,8 +2356,6 @@ impl Tenant {
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
-            cancel: CancellationToken::default(),
-            gate: Gate::new(format!("Tenant<{tenant_id}>")),
        }
    }

@@ -3534,6 +3519,10 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
+                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
+                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -3703,7 +3692,7 @@ mod tests {
    use tokio_util::sync::CancellationToken;

    static TEST_KEY: Lazy<Key> =
-        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
+        Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));

    #[tokio::test]
    async fn test_basic() -> anyhow::Result<()> {
@@ -3799,9 +3788,9 @@ mod tests {
        let writer = tline.writer().await;

        #[allow(non_snake_case)]
-        let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
+        let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
        #[allow(non_snake_case)]
-        let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();
+        let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();

        // Insert a value on the timeline
        writer
@@ -4247,7 +4236,11 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness.try_load_local(&ctx).await.expect_err("should fail");
+        let err = harness
+            .try_load_local(&ctx)
+            .await
+            .err()
+            .expect("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
@@ -4381,7 +4374,7 @@ mod tests {

        let mut keyspace = KeySpaceAccum::new();

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
        let mut blknum = 0;
        for _ in 0..50 {
            for _ in 0..10000 {
@@ -4427,7 +4420,7 @@ mod tests {

        const NUM_KEYS: usize = 1000;

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();

        let mut keyspace = KeySpaceAccum::new();

@@ -4508,7 +4501,7 @@ mod tests {

        const NUM_KEYS: usize = 1000;

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();

        let mut keyspace = KeySpaceAccum::new();

@@ -4599,7 +4592,7 @@ mod tests {
        const NUM_KEYS: usize = 100;
        const NUM_TLINES: usize = 50;

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
        // Track page mutation lsns across different timelines.
        let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];

@@ -4653,6 +4646,74 @@ mod tests {
        Ok(())
    }

+    /// Flushing writes made at `initdb_lsn` must take the initdb optimization code path.
+    #[tokio::test]
+    async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
+        let name = "test_write_at_initdb_lsn_takes_optimization_code_path";
+        let (tenant, ctx) = TenantHarness::create(name)?.load().await;
+
+        let initdb_lsn = Lsn(0x20);
+        let utline = tenant
+            .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        let tline = utline.raw_timeline().unwrap();
+
+        // Spawn the flush loop now so that we can set `expect_initdb_optimization` on its state.
+        tline.maybe_spawn_flush_loop();
+
+        // Make sure the timeline has the minimum set of required keys for operation.
+        // The only operation you can always do on an empty timeline is to `put` new data.
+        // Except if you `put` at `initdb_lsn`.
+        // In that case, there's an optimization to directly create image layers instead of delta layers.
+        // It uses `repartition()`, which assumes some keys to be present.
+        // Let's make sure the test timeline can handle that case.
+        {
+            let mut state = tline.flush_loop_state.lock().unwrap();
+            assert_eq!(
+                timeline::FlushLoopState::Running {
+                    expect_initdb_optimization: false,
+                    initdb_optimization_count: 0,
+                },
+                *state
+            );
+            *state = timeline::FlushLoopState::Running {
+                expect_initdb_optimization: true,
+                initdb_optimization_count: 0,
+            };
+        }
+
+        // Make writes at the initdb_lsn. When we flush it below, it should be handled by the optimization.
+        // As explained above, the optimization requires some keys to be present.
+        // As per `create_empty_timeline` documentation, use init_empty to set them.
+        // This is what `create_test_timeline` does, by the way.
+        let mut modification = tline.begin_modification(initdb_lsn);
+        modification
+            .init_empty_test_timeline()
+            .context("init_empty_test_timeline")?;
+        modification
+            .commit(&ctx)
+            .await
+            .context("commit init_empty_test_timeline modification")?;
+
+        // Do the flush. The flush code will check the expectations that we set above.
+        tline.freeze_and_flush().await?;
+
+        // Assert that freeze_and_flush exercised the initdb optimization at least once.
+        {
+            let state = tline.flush_loop_state.lock().unwrap();
+            let timeline::FlushLoopState::Running {
+                expect_initdb_optimization,
+                initdb_optimization_count,
+            } = *state
+            else {
+                panic!("unexpected state: {:?}", *state);
+            };
+            assert!(expect_initdb_optimization);
+            assert!(initdb_optimization_count > 0);
+        }
+        Ok(())
+    }
+
    #[tokio::test]
    async fn test_uninit_mark_crash() -> anyhow::Result<()> {
        let name = "test_uninit_mark_crash";
@@ -4665,7 +4726,7 @@ mod tests {
            // Keeps uninit mark in place
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown()
+                .shutdown(false)
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -327,7 +327,7 @@ mod tests {
                let mut sz: u16 = rng.gen();
                // Make 50% of the arrays small
                if rng.gen() {
-                    sz &= 63;
+                    sz |= 63;
                }
                random_array(sz.into())
            })
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -21,7 +21,7 @@ use crate::{
 };

 use super::{
-    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
+    mgr::{GetTenantError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
@@ -33,21 +33,12 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

-    #[error("Tenant not attached")]
-    NotAttached,
-
    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

    #[error("Tenant deletion is already in progress")]
    AlreadyInProgress,

-    #[error("Tenant map slot error {0}")]
-    SlotError(#[from] TenantSlotError),
-
-    #[error("Tenant map slot upsert error {0}")]
-    SlotUpsertError(#[from] TenantSlotUpsertError),
-
    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

@@ -282,12 +273,12 @@ impl DeleteTenantFlow {
    pub(crate) async fn run(
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

-        let mut guard = Self::prepare(&tenant).await?;
+        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;

        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
            tenant.set_broken(format!("{e:#}")).await;
@@ -387,7 +378,7 @@ impl DeleteTenantFlow {
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
@@ -414,8 +405,15 @@ impl DeleteTenantFlow {
    }

    async fn prepare(
-        tenant: &Arc<Tenant>,
-    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
+        tenants: &tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
+        let m = tenants.read().await;
+
+        let tenant = m
+            .get(&tenant_id)
+            .ok_or(GetTenantError::NotFound(tenant_id))?;
+
        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
        // so at least for now allow deletions only for active tenants. TODO recheck
        // Broken and Stopping is needed for retries.
@@ -449,14 +447,14 @@ impl DeleteTenantFlow {
            )));
        }

-        Ok(guard)
+        Ok((Arc::clone(tenant), guard))
    }

    fn schedule_background(
        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
        let tenant_id = tenant.tenant_id;
@@ -489,7 +487,7 @@ impl DeleteTenantFlow {
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        tenant: &Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -537,18 +535,10 @@ impl DeleteTenantFlow {
            .await
            .context("cleanup_remaining_fs_traces")?;

-        {
-            let mut locked = tenants.write().unwrap();
-            if locked.remove(&tenant.tenant_id).is_none() {
-                warn!("Tenant got removed from tenants map during deletion");
-            };
-
-            // FIXME: we should not be modifying this from outside of mgr.rs.
-            // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
-            crate::metrics::TENANT_MANAGER
-                .tenant_slots
-                .set(locked.len() as u64);
-        }
+        let mut locked = tenants.write().await;
+        if locked.remove(&tenant.tenant_id).is_none() {
+            warn!("Tenant got removed from tenants map during deletion");
+        };

        *guard = Self::Finished;

--- a/pageserver/src/tenant/disk_btree_test_data.rs
+++ b/pageserver/src/tenant/disk_btree_test_data.rs
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -406,123 +406,4 @@ mod tests {
            METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
        );
    }
-
-    #[test]
-    fn test_metadata_bincode_serde() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let metadata_bytes = original_metadata
-            .to_bytes()
-            .expect("Cannot create bytes array from metadata");
-
-        let metadata_bincode_be_bytes = original_metadata
-            .ser()
-            .expect("Cannot serialize the metadata");
-
-        // 8 bytes for the length of the vector
-        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
-
-        let expected_bincode_bytes = {
-            let mut temp = vec![];
-            let len_bytes = metadata_bytes.len().to_be_bytes();
-            temp.extend_from_slice(&len_bytes);
-            temp.extend_from_slice(&metadata_bytes);
-            temp
-        };
-        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
-
-        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
-        // Deserialized metadata has the metadata header, which is different from the serialized one.
-        //   Reference: TimelineMetaData::to_bytes()
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        assert_eq!(deserialized_metadata, expected_metadata);
-    }
-
-    #[test]
-    fn test_metadata_bincode_serde_ensure_roundtrip() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let expected_bytes = vec![
-            /* bincode length encoding bytes */
-            0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
-            /* TimelineMetadataHeader */
-            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
-            /* TimelineMetadataBodyV2 */
-            0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
-            1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
-            1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119,
-            136, // ancestor_timeline (17 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 15, // pg_version (4 bytes)
-            /* padding bytes */
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0,
-        ];
-        let metadata_ser_bytes = original_metadata.ser().unwrap();
-        assert_eq!(metadata_ser_bytes, expected_bytes);
-
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
-        assert_eq!(des_metadata, expected_metadata);
-    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1542,7 +1542,7 @@ pub fn remote_index_path(
 }

 /// Given the key of an index, parse out the generation part of the name
-pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
+pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
        Some(f) => f,
        None => {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -6,6 +6,7 @@ use std::collections::HashMap;

 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::bin_ser::SerializeError;

 use crate::tenant::metadata::TimelineMetadata;
@@ -57,6 +58,7 @@ impl LayerFileMetadata {
 ///
 /// This type needs to be backwards and forwards compatible. When changing the fields,
 /// remember to add a test case for the changed version.
+#[serde_as]
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct IndexPart {
    /// Debugging aid describing the version of this type.
@@ -76,6 +78,7 @@ pub struct IndexPart {
    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
    // private because internally we would read from metadata instead.
+    #[serde_as(as = "DisplayFromStr")]
    disk_consistent_lsn: Lsn,

    #[serde(rename = "metadata_bytes")]
@@ -152,7 +155,7 @@ pub struct IndexLayerMetadata {

    #[serde(default = "Generation::none")]
    #[serde(skip_serializing_if = "Generation::is_none")]
-    pub generation: Generation,
+    pub(super) generation: Generation,
 }

 impl From<LayerFileMetadata> for IndexLayerMetadata {
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -6,6 +6,7 @@ use std::sync::Arc;
 use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;

 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
@@ -28,6 +29,7 @@ use tenant_size_model::{Segment, StorageModel};
 /// needs. We will convert this into a StorageModel when it's time to perform
 /// the calculation.
 ///
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct ModelInputs {
    pub segments: Vec<SegmentMeta>,
@@ -35,9 +37,11 @@ pub struct ModelInputs {
 }

 /// A [`Segment`], with some extra information for display purposes
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct SegmentMeta {
    pub segment: Segment,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub kind: LsnKind,
 }
@@ -73,22 +77,32 @@ pub enum LsnKind {

 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
 /// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub timeline_id: TimelineId,

+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    pub ancestor_id: Option<TimelineId>,

+    #[serde_as(as = "serde_with::DisplayFromStr")]
    ancestor_lsn: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    last_record: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    latest_gc_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    horizon_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pitr_cutoff: Lsn,

    /// Cutoff point based on GC settings
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    next_gc_cutoff: Lsn,

    /// Cutoff point calculated from the user-supplied 'max_retention_period'
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    retention_param_cutoff: Option<Lsn>,
 }

@@ -349,6 +363,10 @@ async fn fill_logical_sizes(
    // our advantage with `?` error handling.
    let mut joinset = tokio::task::JoinSet::new();

+    let cancel = tokio_util::sync::CancellationToken::new();
+    // be sure to cancel all spawned tasks if we are dropped
+    let _dg = cancel.clone().drop_guard();
+
    // For each point that would benefit from having a logical size available,
    // spawn a Task to fetch it, unless we have it cached already.
    for seg in segments.iter() {
@@ -366,8 +384,15 @@ async fn fill_logical_sizes(
                let parallel_size_calcs = Arc::clone(limit);
                let ctx = ctx.attached_child();
                joinset.spawn(
-                    calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
-                        .in_current_span(),
+                    calculate_logical_size(
+                        parallel_size_calcs,
+                        timeline,
+                        lsn,
+                        cause,
+                        ctx,
+                        cancel.child_token(),
+                    )
+                    .in_current_span(),
                );
            }
            e.insert(cached_size);
@@ -394,12 +419,10 @@ async fn fill_logical_sizes(
                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
-                if !matches!(error, CalculateLogicalSizeError::Cancelled) {
-                    warn!(
-                        timeline_id=%timeline.timeline_id,
-                        "failed to calculate logical size at {lsn}: {error:#}"
-                    );
-                }
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
@@ -475,13 +498,14 @@ async fn calculate_logical_size(
    lsn: utils::lsn::Lsn,
    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
+    cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
        .await
        .expect("global semaphore should not had been closed");

    let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
        .await?;
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,6 +4,7 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod inmemory_layer_raw;
 mod layer;
 mod layer_desc;

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -345,19 +345,14 @@ impl InMemoryLayer {

        let cursor = inner.file.block_cursor();

-        // Sort the keys because delta layer writer expects them sorted.
-        //
-        // NOTE: this sort can take up significant time if the layer has millions of
-        //       keys. To speed up all the comparisons we convert the key to i128 and
-        //       keep the value as a reference.
-        let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
-        keys.sort_unstable_by_key(|k| k.0);
+        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        keys.sort_by_key(|k| k.0);

        let ctx = RequestContextBuilder::extend(ctx)
            .page_content_kind(PageContentKind::InMemoryLayer)
            .build();
        for (key, vec_map) in keys.iter() {
-            let key = Key::from_i128(*key);
+            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
@@ -372,4 +367,61 @@ impl InMemoryLayer {
        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
        Ok(delta_layer)
    }
+
+    /// Bench variant (per its name) of writing this frozen in-memory layer to disk.
+    ///
+    /// Returns `Ok(())` only: the delta layer writer is never `finish`ed (see the TODO at the end).
+    pub async fn write_to_disk_bench(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        // Grab the lock in read-mode. We hold it over the I/O, but because this
+        // layer is not writeable anymore, no one should be trying to acquire the
+        // write lock on it, so we shouldn't block anyone. There's one exception
+        // though: another thread might have grabbed a reference to this layer
+        // in `get_layer_for_write` just before the checkpointer called
+        // `freeze`, and then `write_to_disk` on it. When the thread gets the
+        // lock, it will see that it's not writeable anymore and retry, but it
+        // would have to wait until we release it. That race condition is very
+        // rare though, so we just accept the potential latency hit for now.
+        let inner = self.inner.read().await;
+
+        let end_lsn = *self.end_lsn.get().unwrap();
+
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_id,
+            Key::MIN,
+            self.start_lsn..end_lsn,
+        )
+        .await?;
+
+        let mut buf = Vec::new();
+
+        let cursor = inner.file.block_cursor();
+        // Collect the index entries and order them by key before handing them to the writer.
+        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        keys.sort_by_key(|k| k.0);
+
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+        for (key, vec_map) in keys.iter() {
+            let key = **key;
+            // Write all page versions
+            for (lsn, pos) in vec_map.as_slice() {
+                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                let will_init = Value::des(&buf)?.will_init();
+                delta_layer_writer
+                    .put_value_bytes(key, *lsn, &buf, will_init)
+                    .await?;
+            }
+        }
+
+        // MAX is used here because we identify L0 layers by full key range
+        // TODO XXX: finish the layer; until then the writer is never finished and nothing is returned.
+        // let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer_raw.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer_raw.rs
@@ -0,0 +1,23 @@
+// NOTE(review): `Key`, `Lsn`, `Value`, `RequestContext`, `Result` are not imported here — confirm.
+/// Placeholder in-memory layer type: it currently has no fields.
+pub struct InMemoryLayerRaw {
+}
+
+impl InMemoryLayerRaw {
+    pub async fn new() -> Self {
+        Self {
+            // no fields to initialize yet
+        }
+    }
+    /// Currently a no-op: ignores all of its arguments and returns `Ok(())`.
+    pub async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        // NOTE(review): the body is empty; the value is not stored anywhere yet.
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -337,39 +337,31 @@ enum ResidentOrWantedEvicted {
 }

 impl ResidentOrWantedEvicted {
-    fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
+    fn get(&self) -> Option<Arc<DownloadedLayer>> {
        match self {
-            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
+            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
            ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
                Some(strong) => {
                    LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
-
-                    *self = ResidentOrWantedEvicted::Resident(strong.clone());
-
-                    Some((strong, true))
+                    Some(strong)
                }
                None => None,
            },
        }
    }
-
    /// When eviction is first requested, drop down to holding a [`Weak`].
    ///
-    /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
-    /// drop the possibly last strong reference outside of the mutex of
-    /// heavier_once_cell::OnceCell.
-    fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
+    /// Returns `true` if this was the first time eviction was requested.
+    fn downgrade(&mut self) -> bool {
        match self {
            ResidentOrWantedEvicted::Resident(strong) => {
                let weak = Arc::downgrade(strong);
-                let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
-                std::mem::swap(self, &mut temp);
-                match temp {
-                    ResidentOrWantedEvicted::Resident(strong) => Some(strong),
-                    ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
-                }
+                *self = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
+                // returning the weak is not useful, because the drop could have already run with
+                // the replacement above, and that will take care of cleaning the Option we are in
+                true
            }
-            ResidentOrWantedEvicted::WantedEvicted(..) => None,
+            ResidentOrWantedEvicted::WantedEvicted(..) => false,
        }
    }
 }
@@ -411,10 +403,6 @@ struct LayerInner {
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
-    ///
-    /// If in future we need to implement "wait until layer instances are gone and done", carrying
-    /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
-    /// method for "wait_gc" which will wait to this being closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -565,8 +553,6 @@ impl LayerInner {
        }
    }

-    /// Cancellation safe, however dropping the future and calling this method again might result
-    /// in a new attempt to evict OR join the previously started attempt.
    pub(crate) async fn evict_and_wait(
        &self,
        _: &RemoteTimelineClient,
@@ -577,22 +563,20 @@ impl LayerInner {

        let mut rx = self.status.subscribe();

-        let strong = {
-            match self.inner.get() {
-                Some(mut either) => {
-                    self.wanted_evicted.store(true, Ordering::Relaxed);
-                    either.downgrade()
-                }
-                None => return Err(EvictionError::NotFound),
-            }
-        };
+        let res =
+            self.wanted_evicted
+                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

-        if strong.is_some() {
-            // drop the DownloadedLayer outside of the holding the guard
-            drop(strong);
+        if res.is_ok() {
            LAYER_IMPL_METRICS.inc_started_evictions();
        }

+        if self.get().is_none() {
+            // it was not evictable in the first place
+            // our store to the wanted_evicted does not matter; it will be reset by next download
+            return Err(EvictionError::NotFound);
+        }
+
        match rx.recv().await {
            Ok(Status::Evicted) => Ok(()),
            Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
@@ -606,8 +590,7 @@ impl LayerInner {
                //
                // use however late (compared to the initial expressing of wanted) as the
                // "outcome" now
-                LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get() {
+                match self.get() {
                    Some(_) => Err(EvictionError::Downloaded),
                    None => Ok(()),
                }
@@ -615,17 +598,15 @@ impl LayerInner {
        }
    }

-    /// Cancellation safe.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
+    /// Should be cancellation safe, but cancellation is troublesome together with the spawned
+    /// download.
    async fn get_or_maybe_download(
        self: &Arc<Self>,
        allow_download: bool,
        ctx: Option<&RequestContext>,
    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
-        let mut init_permit = None;
-
        loop {
-            let download = move |permit| async move {
+            let download = move || async move {
                // disable any scheduled but not yet running eviction deletions for this
                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

@@ -646,11 +627,7 @@ impl LayerInner {
                    .await
                    .map_err(DownloadError::PreStatFailed)?;

-                let permit = if let Some(reason) = needs_download {
-                    if let NeedsDownload::NotFile(ft) = reason {
-                        return Err(DownloadError::NotFile(ft));
-                    }
-
+                if let Some(reason) = needs_download {
                    // only reset this after we've decided we really need to download. otherwise it'd
                    // be impossible to mark cancelled downloads for eviction, like one could imagine
                    // we would like to do for prefetching which was not needed.
@@ -660,6 +637,8 @@ impl LayerInner {
                        return Err(DownloadError::NoRemoteStorage);
                    }

+                    tracing::debug!(%reason, "downloading layer");
+
                    if let Some(ctx) = ctx {
                        self.check_expected_download(ctx)?;
                    }
@@ -670,16 +649,12 @@ impl LayerInner {
                        return Err(DownloadError::DownloadRequired);
                    }

-                    tracing::info!(%reason, "downloading on-demand");
-
-                    self.spawn_download_and_wait(timeline, permit).await?
+                    self.spawn_download_and_wait(timeline).await?;
                } else {
                    // the file is present locally, probably by a previous but cancelled call to
                    // get_or_maybe_download. alternatively we might be running without remote storage.
                    LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                    permit
-                };
+                }

                let res = Arc::new(DownloadedLayer {
                    owner: Arc::downgrade(self),
@@ -692,60 +667,19 @@ impl LayerInner {
                    LayerResidenceEventReason::ResidenceChange,
                );

-                let waiters = self.inner.initializer_count();
-                if waiters > 0 {
-                    tracing::info!(waiters, "completing the on-demand download for other tasks");
-                }
-
-                Ok((ResidentOrWantedEvicted::Resident(res), permit))
+                Ok(ResidentOrWantedEvicted::Resident(res))
            };

-            if let Some(init_permit) = init_permit.take() {
-                // use the already held initialization permit because it is impossible to hit the
-                // below paths anymore essentially limiting the max loop iterations to 2.
-                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit);
-                let (strong, _upgraded) = guard
-                    .get_and_upgrade()
-                    .expect("init creates strong reference, we held the init permit");
+            let locked = self.inner.get_or_init(download).await?;
+
+            if let Some(strong) = Self::get_or_apply_evictedness(Some(locked), &self.wanted_evicted)
+            {
                return Ok(strong);
            }

-            let (weak, permit) = {
-                let mut locked = self.inner.get_or_init(download).await?;
-
-                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
-                    if upgraded {
-                        // when upgraded back, the Arc<DownloadedLayer> is still available, but
-                        // previously a `evict_and_wait` was received.
-                        self.wanted_evicted.store(false, Ordering::Relaxed);
-
-                        // error out any `evict_and_wait`
-                        drop(self.status.send(Status::Downloaded));
-                        LAYER_IMPL_METRICS
-                            .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
-                    }
-
-                    return Ok(strong);
-                } else {
-                    // path to here: the evict_blocking is stuck on spawn_blocking queue.
-                    //
-                    // reset the contents, deactivating the eviction and causing a
-                    // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
-                    locked.take_and_deinit()
-                }
-            };
-
-            // unlock first, then drop the weak, but because upgrade failed, we
-            // know it cannot be a problem.
-
-            assert!(
-                matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
-                "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
-            );
-
-            init_permit = Some(permit);
-
+            // the situation in which we might need to retry is that our init was ready
+            // immediately, but the DownloadedLayer had been dropped BUT failed to complete
+            // Self::evict_blocking
            LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
        }
    }
@@ -757,8 +691,8 @@ impl LayerInner {
        match b {
            Download => Ok(()),
            Warn | Error => {
-                tracing::info!(
-                    "unexpectedly on-demand downloading for task kind {:?}",
+                tracing::warn!(
+                    "unexpectedly on-demand downloading remote layer {self} for task kind {:?}",
                    ctx.task_kind()
                );
                crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
@@ -780,17 +714,14 @@ impl LayerInner {
    async fn spawn_download_and_wait(
        self: &Arc<Self>,
        timeline: Arc<Timeline>,
-        permit: heavier_once_cell::InitPermit,
-    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
+    ) -> Result<(), DownloadError> {
        let task_name = format!("download layer {}", self);

        let (tx, rx) = tokio::sync::oneshot::channel();
-
        // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
        // block tenant::mgr::remove_tenant_from_memory.

        let this: Arc<Self> = self.clone();
-
        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
@@ -799,7 +730,6 @@ impl LayerInner {
            &task_name,
            false,
            async move {
-
                let client = timeline
                    .remote_client
                    .as_ref()
@@ -821,9 +751,9 @@ impl LayerInner {
                    }
                };

-                if let Err(res) = tx.send((result, permit)) {
+                if let Err(res) = tx.send(result) {
                    match res {
-                        (Ok(()), _) => {
+                        Ok(()) => {
                            // our caller is cancellation safe so this is fine; if someone
                            // else requests the layer, they'll find it already downloaded
                            // or redownload.
@@ -834,7 +764,7 @@ impl LayerInner {
                            tracing::info!("layer file download completed after requester had cancelled");
                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
                        },
-                        (Err(e), _) => {
+                        Err(e) => {
                            // our caller is cancellation safe, but we might be racing with
                            // another attempt to initialize. before we have cancellation
                            // token support: these attempts should converge regardless of
@@ -850,7 +780,7 @@ impl LayerInner {
            .in_current_span(),
        );
        match rx.await {
-            Ok((Ok(()), permit)) => {
+            Ok(Ok(())) => {
                if let Some(reason) = self
                    .needs_download()
                    .await
@@ -861,12 +791,10 @@ impl LayerInner {
                }

                self.consecutive_failures.store(0, Ordering::Relaxed);
-                tracing::info!("on-demand download successful");

-                Ok(permit)
+                Ok(())
            }
-            Ok((Err(e), _permit)) => {
-                // FIXME: this should be with the spawned task and be cancellation sensitive
+            Ok(Err(e)) => {
                let consecutive_failures =
                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -884,6 +812,33 @@ impl LayerInner {
        }
    }

+    /// Access the current state without waiting for the file to be downloaded.
+    ///
+    /// Requires that we have been initialized to a state which reflects the
+    /// actual residency state.
+    fn get(&self) -> Option<Arc<DownloadedLayer>> {
+        let locked = self.inner.get();
+        Self::get_or_apply_evictedness(locked, &self.wanted_evicted)
+    }
+
+    fn get_or_apply_evictedness(
+        guard: Option<heavier_once_cell::Guard<'_, ResidentOrWantedEvicted>>,
+        wanted_evicted: &AtomicBool,
+    ) -> Option<Arc<DownloadedLayer>> {
+        if let Some(mut x) = guard {
+            if let Some(won) = x.get() {
+                // there are no guarantees that we will always get to observe a concurrent call
+                // to evict
+                if wanted_evicted.load(Ordering::Acquire) {
+                    x.downgrade();
+                }
+                return Some(won);
+            }
+        }
+
+        None
+    }
+
    async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
        match tokio::fs::metadata(&self.path).await {
            Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -903,7 +858,7 @@ impl LayerInner {
    fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
        // in future, this should include sha2-256 validation of the file.
        if !m.is_file() {
-            Err(NeedsDownload::NotFile(m.file_type()))
+            Err(NeedsDownload::NotFile)
        } else if m.len() != self.desc.file_size {
            Err(NeedsDownload::WrongSize {
                actual: m.len(),
@@ -917,9 +872,7 @@ impl LayerInner {
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.desc.filename().file_name();

-        // this is not accurate: we could have the file locally but there was a cancellation
-        // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get().is_none();
+        let remote = self.get().is_none();

        let access_stats = self.access_stats.as_api_model(reset);

@@ -1054,14 +1007,11 @@ impl LayerInner {
                Ok(())
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                tracing::error!(
-                    layer_size = %self.desc.file_size,
-                    "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
-                );
+                tracing::info!("failed to evict file from disk, it was already gone");
                Err(EvictionCancelled::FileNotFound)
            }
            Err(e) => {
-                tracing::error!("failed to evict file from disk: {e:#}");
+                tracing::warn!("failed to evict file from disk: {e:#}");
                Err(EvictionCancelled::RemoveFailed)
            }
        };
@@ -1105,8 +1055,6 @@ enum DownloadError {
    ContextAndConfigReallyDeniesDownloads,
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
-    #[error("layer path exists, but it is not a file: {0:?}")]
-    NotFile(std::fs::FileType),
    /// Why no error here? Because it will be reported by page_service. We should had also done
    /// retries already.
    #[error("downloading evicted layer file failed")]
@@ -1122,7 +1070,7 @@ enum DownloadError {
 #[derive(Debug, PartialEq)]
 pub(crate) enum NeedsDownload {
    NotFound,
-    NotFile(std::fs::FileType),
+    NotFile,
    WrongSize { actual: u64, expected: u64 },
 }

@@ -1130,7 +1078,7 @@ impl std::fmt::Display for NeedsDownload {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            NeedsDownload::NotFound => write!(f, "file was not found"),
-            NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
+            NeedsDownload::NotFile => write!(f, "path is not a file"),
            NeedsDownload::WrongSize { actual, expected } => {
                write!(f, "file size mismatch {actual} vs. {expected}")
            }
@@ -1141,8 +1089,6 @@ impl std::fmt::Display for NeedsDownload {
 /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
 pub(crate) struct DownloadedLayer {
    owner: Weak<LayerInner>,
-    // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
-    // DownloadedLayer
    kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
    version: usize,
 }
@@ -1186,6 +1132,7 @@ impl DownloadedLayer {
                "these are the same, just avoiding the upgrade"
            );

+            // there is nothing async here, but it should be async
            let res = if owner.desc.is_delta {
                let summary = Some(delta_layer::Summary::expected(
                    owner.desc.tenant_id,
@@ -1284,8 +1231,6 @@ impl std::fmt::Debug for ResidentLayer {

 impl ResidentLayer {
    /// Release the eviction guard, converting back into a plain [`Layer`].
-    ///
-    /// You can access the [`Layer`] also by using `as_ref`.
    pub(crate) fn drop_eviction_guard(self) -> Layer {
        self.into()
    }
@@ -1341,7 +1286,7 @@ impl AsRef<Layer> for ResidentLayer {
    }
 }

-/// Drop the eviction guard.
+/// Allow slimming down if we don't want the `2*usize` with eviction candidates?
 impl From<ResidentLayer> for Layer {
    fn from(value: ResidentLayer) -> Self {
        value.owner
@@ -1511,13 +1456,6 @@ impl LayerImplMetrics {
            .unwrap()
            .inc();
    }
-
-    fn inc_broadcast_lagged(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["broadcast_lagged"])
-            .unwrap()
-            .inc();
-    }
 }

 enum EvictionCancelled {
@@ -1529,8 +1467,6 @@ enum EvictionCancelled {
    AlreadyReinitialized,
    /// Not evicted because of a pending reinitialization
    LostToDownload,
-    /// After eviction, there was a new layer access which cancelled the eviction.
-    UpgradedBackOnAccess,
 }

 impl EvictionCancelled {
@@ -1543,7 +1479,6 @@ impl EvictionCancelled {
            EvictionCancelled::RemoveFailed => "remove_failed",
            EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
            EvictionCancelled::LostToDownload => "lost_to_download",
-            EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,7 +23,7 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{id::TenantTimelineId, sync::gate::Gate};
+use utils::id::TenantTimelineId;

 use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
@@ -36,6 +36,7 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
@@ -49,7 +50,6 @@ use crate::tenant::{
    metadata::{save_metadata, TimelineMetadata},
    par_fsync,
 };
-use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
@@ -95,7 +95,12 @@ use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenant
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
    NotStarted,
-    Running,
+    Running {
+        #[cfg(test)]
+        expect_initdb_optimization: bool,
+        #[cfg(test)]
+        initdb_optimization_count: usize,
+    },
    Exited,
 }

@@ -242,7 +247,7 @@ pub struct Timeline {
    /// the flush finishes. You can use that to wait for the flush to finish.
    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
-    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
+    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -305,13 +310,6 @@ pub struct Timeline {
    /// Load or creation time information about the disk_consistent_lsn and when the loading
    /// happened. Used for consumption metrics.
    pub(crate) loaded_at: (Lsn, SystemTime),
-
-    /// Gate to prevent shutdown completing while I/O is still happening to this timeline's data
-    pub(crate) gate: Gate,
-
-    /// Cancellation token scoped to this timeline: anything doing long-running work relating
-    /// to the timeline should drop out when this token fires.
-    pub(crate) cancel: CancellationToken,
 }

 pub struct WalReceiverInfo {
@@ -369,19 +367,6 @@ pub enum PageReconstructError {
    WalRedo(anyhow::Error),
 }

-#[derive(thiserror::Error, Debug)]
-enum FlushLayerError {
-    /// Timeline cancellation token was cancelled
-    #[error("timeline shutting down")]
-    Cancelled,
-
-    #[error(transparent)]
-    PageReconstructError(#[from] PageReconstructError),
-
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
 impl std::fmt::Debug for PageReconstructError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
@@ -801,11 +786,7 @@ impl Timeline {
                // as an empty timeline. Also in unit tests, when we use the timeline
                // as a simple key-value store, ignoring the datadir layout. Log the
                // error but continue.
-                //
-                // Suppress error when it's due to cancellation
-                if !self.cancel.is_cancelled() {
-                    error!("could not compact, repartitioning keyspace failed: {err:?}");
-                }
+                error!("could not compact, repartitioning keyspace failed: {err:?}");
            }
        };

@@ -899,17 +880,11 @@ impl Timeline {
        self.launch_eviction_task(background_jobs_can_start);
    }

-    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
-    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
-    ///
-    /// While we are flushing, we continue to accept read I/O.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
-    pub(crate) async fn flush_and_shutdown(&self) {
+    pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
-        // trying to flush
-        tracing::debug!("Waiting for WalReceiverManager...");
+        // prevent writes to the InMemoryLayer
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
            Some(self.tenant_id),
@@ -917,74 +892,34 @@ impl Timeline {
        )
        .await;

-        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
-        self.last_record_lsn.shutdown();
-
        // now all writers to InMemory layer are gone, do the final flush if requested
-        match self.freeze_and_flush().await {
-            Ok(_) => {
-                // drain the upload queue
-                if let Some(client) = self.remote_client.as_ref() {
-                    // if we did not wait for completion here, it might be our shutdown process
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // be spawned.
-                    //
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
-                    // obviously it does not make sense to stop while we wait for it, but what
-                    // about corner cases like s3 suddenly hanging up?
-                    if let Err(e) = client.wait_completion().await {
-                        // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                        // we have some extra WAL replay to do next time the timeline starts.
-                        warn!("failed to flush to remote storage: {e:#}");
-                    }
-                }
-            }
-            Err(e) => {
-                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                // we have some extra WAL replay to do next time the timeline starts.
-                warn!("failed to freeze and flush: {e:#}");
-            }
-        }
-
-        self.shutdown().await;
-    }
-
-    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
-    /// the graceful [`Timeline::flush_and_shutdown`] function.
-    pub(crate) async fn shutdown(&self) {
-        // Signal any subscribers to our cancellation token to drop out
-        tracing::debug!("Cancelling CancellationToken");
-        self.cancel.cancel();
-
-        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
-        // while doing so.
-        self.last_record_lsn.shutdown();
-
-        // Shut down the layer flush task before the remote client, as one depends on the other
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::LayerFlushTask),
-            Some(self.tenant_id),
-            Some(self.timeline_id),
-        )
-        .await;
-
-        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
-        // case our caller wants to use that for a deletion
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            match remote_client.stop() {
+        if freeze_and_flush {
+            match self.freeze_and_flush().await {
                Ok(()) => {}
-                Err(StopError::QueueUninitialized) => {
-                    // Shutting down during initialization is legal
+                Err(e) => {
+                    warn!("failed to freeze and flush: {e:#}");
+                    return; // TODO: should probably drain remote timeline client anyways?
                }
            }
+
+            // drain the upload queue
+            let res = if let Some(client) = self.remote_client.as_ref() {
+                // if we did not wait for completion here, it might be our shutdown process
+                // didn't wait for remote uploads to complete at all, as new tasks can forever
+                // be spawned.
+                //
+                // what is problematic is the shutting down of RemoteTimelineClient, because
+                // obviously it does not make sense to stop while we wait for it, but what
+                // about corner cases like s3 suddenly hanging up?
+                client.wait_completion().await
+            } else {
+                Ok(())
+            };
+
+            if let Err(e) = res {
+                warn!("failed to await for frozen and flushed uploads: {e:#}");
+            }
        }
-
-        tracing::debug!("Waiting for tasks...");
-
-        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
-
-        // Finally wait until any gate-holders are complete
-        self.gate.close().await;
    }

    pub fn set_state(&self, new_state: TimelineState) {
@@ -1024,12 +959,7 @@ impl Timeline {
            reason,
            backtrace: backtrace_str,
        };
-        self.set_state(broken_state);
-
-        // Although the Broken state is not equivalent to shutdown() (shutdown will be called
-        // later when this tenant is detach or the process shuts down), firing the cancellation token
-        // here avoids the need for other tasks to watch for the Broken state explicitly.
-        self.cancel.cancel();
+        self.set_state(broken_state)
    }

    pub fn current_state(&self) -> TimelineState {
@@ -1118,11 +1048,6 @@ impl Timeline {
    /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
    /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
    pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| anyhow::anyhow!("Shutting down"))?;
-
        let Some(local_layer) = self.find_layer(layer_file_name).await else {
            return Ok(None);
        };
@@ -1138,8 +1063,9 @@ impl Timeline {
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;

+        let cancel = CancellationToken::new();
        let results = self
-            .evict_layer_batch(remote_client, &[local_layer])
+            .evict_layer_batch(remote_client, &[local_layer], &cancel)
            .await?;
        assert_eq!(results.len(), 1);
        let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
@@ -1154,18 +1080,15 @@ impl Timeline {
    pub(crate) async fn evict_layers(
        &self,
        layers_to_evict: &[Layer],
+        cancel: &CancellationToken,
    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| anyhow::anyhow!("Shutting down"))?;
-
        let remote_client = self
            .remote_client
            .as_ref()
            .context("timeline must have RemoteTimelineClient")?;

-        self.evict_layer_batch(remote_client, layers_to_evict).await
+        self.evict_layer_batch(remote_client, layers_to_evict, cancel)
+            .await
    }

    /// Evict multiple layers at once, continuing through errors.
@@ -1186,6 +1109,7 @@ impl Timeline {
        &self,
        remote_client: &Arc<RemoteTimelineClient>,
        layers_to_evict: &[Layer],
+        cancel: &CancellationToken,
    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
        // ensure that the layers have finished uploading
        // (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
@@ -1233,7 +1157,7 @@ impl Timeline {
        };

        tokio::select! {
-            _ = self.cancel.cancelled() => {},
+            _ = cancel.cancelled() => {},
            _ = join => {}
        }

@@ -1343,7 +1267,6 @@ impl Timeline {
        initial_logical_size_can_start: Option<completion::Barrier>,
        initial_logical_size_attempt: Option<completion::Completion>,
        state: TimelineState,
-        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let (state, _) = watch::channel(state);
@@ -1444,8 +1367,6 @@ impl Timeline {

                initial_logical_size_can_start,
                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
-                cancel,
-                gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -1461,7 +1382,7 @@ impl Timeline {
        let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
        match *flush_loop_state {
            FlushLoopState::NotStarted => (),
-            FlushLoopState::Running => {
+            FlushLoopState::Running { .. } => {
                info!(
                    "skipping attempt to start flush_loop twice {}/{}",
                    self.tenant_id, self.timeline_id
@@ -1481,7 +1402,12 @@ impl Timeline {
        let self_clone = Arc::clone(self);

        debug!("spawning flush loop");
-        *flush_loop_state = FlushLoopState::Running;
+        *flush_loop_state = FlushLoopState::Running {
+            #[cfg(test)]
+            expect_initdb_optimization: false,
+            #[cfg(test)]
+            initdb_optimization_count: 0,
+        };
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::LayerFlushTask,
@@ -1493,7 +1419,7 @@ impl Timeline {
                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
                self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
-                assert!(matches!(*flush_loop_state, FlushLoopState::Running));
+                assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
                *flush_loop_state  = FlushLoopState::Exited;
                Ok(())
            }
@@ -1780,8 +1706,12 @@ impl Timeline {
                // delay will be terminated by a timeout regardless.
                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };

+                // no extra cancellation here, because nothing really waits for this to complete compared
+                // to spawn_ondemand_logical_size_calculation.
+                let cancel = CancellationToken::new();
+
                let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                    .await
                {
                    Ok(s) => s,
@@ -1850,6 +1780,7 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: RequestContext,
+        cancel: CancellationToken,
    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
        let (sender, receiver) = oneshot::channel();
        let self_clone = Arc::clone(self);
@@ -1870,7 +1801,7 @@ impl Timeline {
            false,
            async move {
                let res = self_clone
-                    .logical_size_calculation_task(lsn, cause, &ctx)
+                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
                    .await;
                let _ = sender.send(res).ok();
                Ok(()) // Receiver is responsible for handling errors
@@ -1886,28 +1817,58 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
+        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
        span::debug_assert_current_span_has_tenant_and_timeline_id();

-        let _guard = self.gate.enter();
-
+        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);

        let mut calculation = pin!(async {
+            let cancel = cancel.child_token();
            let ctx = ctx.attached_child();
            self_calculation
-                .calculate_logical_size(lsn, cause, &ctx)
+                .calculate_logical_size(lsn, cause, cancel, &ctx)
                .await
        });
+        let timeline_state_cancellation = async {
+            loop {
+                match timeline_state_updates.changed().await {
+                    Ok(()) => {
+                        let new_state = timeline_state_updates.borrow().clone();
+                        match new_state {
+                            // we're running this job for active timelines only
+                            TimelineState::Active => continue,
+                            TimelineState::Broken { .. }
+                            | TimelineState::Stopping
+                            | TimelineState::Loading => {
+                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
+                            }
+                        }
+                    }
+                    Err(_sender_dropped_error) => {
+                        // can't happen, the sender is not dropped as long as the Timeline exists
+                        break "aborted because state watch was dropped".to_string();
+                    }
+                }
+            }
+        };
+
+        let taskmgr_shutdown_cancellation = async {
+            task_mgr::shutdown_watcher().await;
+            "aborted because task_mgr shutdown requested".to_string()
+        };

        tokio::select! {
            res = &mut calculation => { res }
-            _ = self.cancel.cancelled() => {
-                debug!("cancelling logical size calculation for timeline shutdown");
+            reason = timeline_state_cancellation => {
+                debug!(reason = reason, "cancelling calculation");
+                cancel.cancel();
                calculation.await
            }
-            _ = task_mgr::shutdown_watcher() => {
-                debug!("cancelling logical size calculation for task shutdown");
+            reason = taskmgr_shutdown_cancellation => {
+                debug!(reason = reason, "cancelling calculation");
+                cancel.cancel();
                calculation.await
            }
        }
@@ -1921,6 +1882,7 @@ impl Timeline {
        &self,
        up_to_lsn: Lsn,
        cause: LogicalSizeCalculationCause,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        info!(
@@ -1963,7 +1925,7 @@ impl Timeline {
        };
        let timer = storage_time_metrics.start_timer();
        let logical_size = self
-            .get_current_logical_size_non_incremental(up_to_lsn, ctx)
+            .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
            .await?;
        debug!("calculated logical size: {logical_size}");
        timer.stop_and_record();
@@ -2068,10 +2030,6 @@ impl Timeline {
        let mut cont_lsn = Lsn(request_lsn.0 + 1);

        'outer: loop {
-            if self.cancel.is_cancelled() {
-                return Err(PageReconstructError::Cancelled);
-            }
-
            // The function should have updated 'state'
            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
            match result {
@@ -2376,10 +2334,6 @@ impl Timeline {
        info!("started flush loop");
        loop {
            tokio::select! {
-                _ = self.cancel.cancelled() => {
-                    info!("shutting down layer flush task");
-                    break;
-                },
                _ = task_mgr::shutdown_watcher() => {
                    info!("shutting down layer flush task");
                    break;
@@ -2391,14 +2345,6 @@ impl Timeline {
            let timer = self.metrics.flush_time_histo.start_timer();
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
-                if self.cancel.is_cancelled() {
-                    info!("dropping out of flush loop for timeline shutdown");
-                    // Note: we do not bother transmitting into [`layer_flush_done_tx`], because
-                    // anyone waiting on that will respect self.cancel as well: they will stop
-                    // waiting at the same time we as drop out of this loop.
-                    return;
-                }
-
                let layer_to_flush = {
                    let guard = self.layers.read().await;
                    guard.layer_map().frozen_layers.front().cloned()
@@ -2407,18 +2353,9 @@ impl Timeline {
                let Some(layer_to_flush) = layer_to_flush else {
                    break Ok(());
                };
-                match self.flush_frozen_layer(layer_to_flush, ctx).await {
-                    Ok(()) => {}
-                    Err(FlushLayerError::Cancelled) => {
-                        info!("dropping out of flush loop for timeline shutdown");
-                        return;
-                    }
-                    err @ Err(
-                        FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
-                    ) => {
-                        error!("could not flush frozen layer: {err:?}");
-                        break err;
-                    }
+                if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
+                    error!("could not flush frozen layer: {err:?}");
+                    break Err(err);
                }
            };
            // Notify any listeners that we're done
@@ -2440,7 +2377,7 @@ impl Timeline {
        let mut my_flush_request = 0;

        let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
-        if !matches!(flush_loop_state, FlushLoopState::Running) {
+        if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
            anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
        }

@@ -2467,17 +2404,7 @@ impl Timeline {
                }
            }
            trace!("waiting for flush to complete");
-            tokio::select! {
-                rx_e = rx.changed() => {
-                    rx_e?;
-                },
-                // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
-                // the notification from [`flush_loop`] that it completed.
-                _ = self.cancel.cancelled() => {
-                    tracing::info!("Cancelled layer flush due on timeline shutdown");
-                    return Ok(())
-                }
-            };
+            rx.changed().await?;
            trace!("done")
        }
    }
@@ -2492,13 +2419,61 @@ impl Timeline {
        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
-    ) -> Result<(), FlushLayerError> {
+    ) -> anyhow::Result<()> {
+        // As a special case, when we have just imported an image into the repository,
+        // instead of writing out a L0 delta layer, we directly write out image layer
+        // files instead. This is possible as long as *all* the data imported into the
+        // repository have the same LSN.
        let lsn_range = frozen_layer.get_lsn_range();
-        let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
-
-        if self.cancel.is_cancelled() {
-            return Err(FlushLayerError::Cancelled);
-        }
+        let (layers_to_upload, delta_layer_to_add) =
+            if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
+                #[cfg(test)]
+                match &mut *self.flush_loop_state.lock().unwrap() {
+                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
+                        panic!("flush loop not running")
+                    }
+                    FlushLoopState::Running {
+                        initdb_optimization_count,
+                        ..
+                    } => {
+                        *initdb_optimization_count += 1;
+                    }
+                }
+                // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
+                // require downloading anything during initial import.
+                let (partitioning, _lsn) = self
+                    .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
+                    .await?;
+                // For image layers, we add them immediately into the layer map.
+                (
+                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
+                        .await?,
+                    None,
+                )
+            } else {
+                #[cfg(test)]
+                match &mut *self.flush_loop_state.lock().unwrap() {
+                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
+                        panic!("flush loop not running")
+                    }
+                    FlushLoopState::Running {
+                        expect_initdb_optimization,
+                        ..
+                    } => {
+                        assert!(!*expect_initdb_optimization, "expected initdb optimization");
+                    }
+                }
+                // Normal case, write out a L0 delta layer file.
+                // `create_delta_layer` will not modify the layer map.
+                // We will remove frozen layer and add delta layer in one atomic operation later.
+                let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
+                (
+                    // FIXME: even though we assume a single image and a single delta layer,
+                    // we still push them to a vec
+                    vec![layer.clone()],
+                    Some(layer),
+                )
+            };

        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
@@ -2509,21 +2484,18 @@ impl Timeline {
        let metadata = {
            let mut guard = self.layers.write().await;

-            if self.cancel.is_cancelled() {
-                return Err(FlushLayerError::Cancelled);
-            }
-
-            guard.finish_flush_l0_layer(&layer, &frozen_layer, &self.metrics);
+            guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);

            if disk_consistent_lsn != old_disk_consistent_lsn {
                assert!(disk_consistent_lsn > old_disk_consistent_lsn);
                self.disk_consistent_lsn.store(disk_consistent_lsn);

                // Schedule remote uploads that will reflect our new disk_consistent_lsn
-                Some(self.schedule_uploads(disk_consistent_lsn, [layer])?)
+                Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
            } else {
                None
            }
+            // release lock on 'layers'
        };

        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
@@ -2964,10 +2936,13 @@ struct CompactLevel0Phase1StatsBuilder {
    new_deltas_size: Option<u64>,
 }

+#[serde_as]
 #[derive(serde::Serialize)]
 struct CompactLevel0Phase1Stats {
    version: u64,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    tenant_id: TenantId,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
@@ -4394,10 +4369,25 @@ mod tests {
            .expect("should had been resident")
            .drop_eviction_guard();

+        let cancel = tokio_util::sync::CancellationToken::new();
        let batch = [layer];

-        let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
-        let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
+        let first = {
+            let cancel = cancel.child_token();
+            async {
+                let cancel = cancel;
+                timeline
+                    .evict_layer_batch(&rc, &batch, &cancel)
+                    .await
+                    .unwrap()
+            }
+        };
+        let second = async {
+            timeline
+                .evict_layer_batch(&rc, &batch, &cancel)
+                .await
+                .unwrap()
+        };

        let (first, second) = tokio::join!(first, second);

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -17,7 +17,6 @@ use crate::{
    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
-        debug_assert_current_span_has_tenant_and_timeline_id,
        metadata::TimelineMetadata,
        remote_timeline_client::{
            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
@@ -31,11 +30,6 @@ use super::{Timeline, TimelineResources};

 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-    // Notify any timeline work to drop out of loops/requests
-    tracing::debug!("Cancelling CancellationToken");
-    timeline.cancel.cancel();
-
    // Stop the walreceiver first.
    debug!("waiting for wal receiver to shutdown");
    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
@@ -80,11 +74,6 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
            "failpoint: timeline-delete-before-index-deleted-at"
        ))?
    });
-
-    tracing::debug!("Waiting for gate...");
-    timeline.gate.close().await;
-    tracing::debug!("Shutdown complete");
-
    Ok(())
 }

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -277,7 +277,10 @@ impl Timeline {
            Some(c) => c,
        };

-        let results = match self.evict_layer_batch(remote_client, &candidates).await {
+        let results = match self
+            .evict_layer_batch(remote_client, &candidates, cancel)
+            .await
+        {
            Err(pre_err) => {
                stats.errors += candidates.len();
                error!("could not do any evictions: {pre_err:#}");
@@ -326,7 +329,8 @@ impl Timeline {
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_timeline_cached_layer_accesses(ctx).await;
+                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
+                    .await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now())
            }
        }
@@ -340,7 +344,20 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
+        //
+        // It is critical we are responsive to cancellation here. Otherwise, we deadlock with
+        // tenant deletion (which holds TENANTS in read mode) when any other task attempts to
+        // acquire TENANTS in write mode before we call get_tenant here.
+        // See https://github.com/neondatabase/neon/issues/5284.
+        let res = tokio::select! {
+            _ = cancel.cancelled() => {
+                return ControlFlow::Break(());
+            }
+            res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
+                res
+            }
+        };
+        let tenant = match res {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
@@ -366,12 +383,21 @@ impl Timeline {

    /// Recompute the values which would cause on-demand downloads during restart.
    #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
+    async fn imitate_timeline_cached_layer_accesses(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) {
        let lsn = self.get_last_record_lsn();

        // imitiate on-restart initial logical size
        let size = self
-            .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                cancel.clone(),
+                ctx,
+            )
            .instrument(info_span!("calculate_logical_size"))
            .await;

--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -164,7 +164,7 @@ impl LayerManager {
    /// Flush a frozen layer and add the written delta layer to the layer map.
    pub(crate) fn finish_flush_l0_layer(
        &mut self,
-        delta_layer: &ResidentLayer,
+        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
        metrics: &TimelineMetrics,
    ) {
@@ -179,14 +179,12 @@ impl LayerManager {
        // layer to disk at the same time, that would not work.
        assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));

-        let mut updates = self.layer_map.batch_update();
-        Self::insert_historic_layer(
-            delta_layer.as_ref().clone(),
-            &mut updates,
-            &mut self.layer_fmgr,
-        );
-        metrics.record_new_file_metrics(delta_layer.layer_desc().file_size);
-        updates.flush();
+        if let Some(l) = delta_layer {
+            let mut updates = self.layer_map.batch_update();
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
+            updates.flush();
+        }
    }

    /// Called when compaction is completed.
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -426,7 +426,7 @@ impl ConnectionManagerState {
                    timeline,
                    new_sk.wal_source_connconf,
                    events_sender,
-                    cancellation.clone(),
+                    cancellation,
                    connect_timeout,
                    ctx,
                    node_id,
@@ -447,14 +447,7 @@ impl ConnectionManagerState {
                            }
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
-                                if cancellation.is_cancelled() {
-                                    // Ideally we would learn about this via some path other than Other, but
-                                    // that requires refactoring all the intermediate layers of ingest code
-                                    // that only emit anyhow::Error
-                                    Ok(())
-                                } else {
-                                    Err(e).context("walreceiver connection handling failure")
-                                }
+                                Err(e).context("walreceiver connection handling failure")
                            }
                        }
                    }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,7 +19,6 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{RwLock, RwLockWriteGuard};
-use utils::fs_ext;

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -174,78 +173,37 @@ impl OpenFiles {
    }
 }

-/// Identify error types that should alwways terminate the process.  Other
-/// error types may be elegible for retry.
-pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
-    use nix::errno::Errno::*;
-    match e.raw_os_error().map(nix::errno::from_i32) {
-        Some(EIO) => {
-            // Terminate on EIO because we no longer trust the device to store
-            // data safely, or to uphold persistence guarantees on fsync.
-            true
-        }
-        Some(EROFS) => {
-            // Terminate on EROFS because a filesystem is usually remounted
-            // readonly when it has experienced some critical issue, so the same
-            // logic as EIO applies.
-            true
-        }
-        Some(EACCES) => {
-            // Terminate on EACCESS because we should always have permissions
-            // for our own data dir: if we don't, then we can't do our job and
-            // need administrative intervention to fix permissions.  Terminating
-            // is the best way to make sure we stop cleanly rather than going
-            // into infinite retry loops, and will make it clear to the outside
-            // world that we need help.
-            true
-        }
-        _ => {
-            // Treat all other local file I/O errors are retryable.  This includes:
-            // - ENOSPC: we stay up and wait for eviction to free some space
-            // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
-            // - WriteZero, Interrupted: these are used internally VirtualFile
-            false
-        }
-    }
+#[derive(Debug, thiserror::Error)]
+pub enum CrashsafeOverwriteError {
+    #[error("final path has no parent dir")]
+    FinalPathHasNoParentDir,
+    #[error("remove tempfile")]
+    RemovePreviousTempfile(#[source] std::io::Error),
+    #[error("create tempfile")]
+    CreateTempfile(#[source] std::io::Error),
+    #[error("write tempfile")]
+    WriteContents(#[source] std::io::Error),
+    #[error("sync tempfile")]
+    SyncTempfile(#[source] std::io::Error),
+    #[error("rename tempfile to final path")]
+    RenameTempfileToFinalPath(#[source] std::io::Error),
+    #[error("open final path parent dir")]
+    OpenFinalPathParentDir(#[source] std::io::Error),
+    #[error("sync final path parent dir")]
+    SyncFinalPathParentDir(#[source] std::io::Error),
 }
-
-/// Call this when the local filesystem gives us an error with an external
-/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either
-/// bad storage or bad configuration, and we can't fix that from inside
-/// a running process.
-pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
-    tracing::error!("Fatal I/O error: {e}: {context})");
-    std::process::abort();
-}
-
-pub(crate) trait MaybeFatalIo<T> {
-    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
-    fn fatal_err(self, context: &str) -> T;
-}
-
-impl<T> MaybeFatalIo<T> for std::io::Result<T> {
-    /// Terminate the process if the result is an error of a fatal type, else pass it through
-    ///
-    /// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but
-    /// not on ENOSPC.
-    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
-        if let Err(e) = &self {
-            if is_fatal_io_error(e) {
-                on_fatal_io_error(e, context);
-            }
-        }
-        self
-    }
-
-    /// Terminate the process on any I/O error.
-    ///
-    /// This is appropriate for reads on files that we know exist: they should always work.
-    fn fatal_err(self, context: &str) -> T {
+impl CrashsafeOverwriteError {
+    /// Returns true iff the new contents are durably stored.
+    pub fn are_new_contents_durable(&self) -> bool {
        match self {
-            Ok(v) => v,
-            Err(e) => {
-                on_fatal_io_error(&e, context);
-            }
+            Self::FinalPathHasNoParentDir => false,
+            Self::RemovePreviousTempfile(_) => false,
+            Self::CreateTempfile(_) => false,
+            Self::WriteContents(_) => false,
+            Self::SyncTempfile(_) => false,
+            Self::RenameTempfileToFinalPath(_) => false,
+            Self::OpenFinalPathParentDir(_) => false,
+            Self::SyncFinalPathParentDir(_) => true,
        }
    }
 }
@@ -326,13 +284,15 @@ impl VirtualFile {
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
        content: &[u8],
-    ) -> std::io::Result<()> {
+    ) -> Result<(), CrashsafeOverwriteError> {
        let Some(final_path_parent) = final_path.parent() else {
-            return Err(std::io::Error::from_raw_os_error(
-                nix::errno::Errno::EINVAL as i32,
-            ));
+            return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
        };
-        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
+        match std::fs::remove_file(tmp_path) {
+            Ok(()) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
+        }
        let mut file = Self::open_with_options(
            tmp_path,
            OpenOptions::new()
@@ -341,20 +301,31 @@ impl VirtualFile {
                // we bail out instead of causing damage.
                .create_new(true),
        )
-        .await?;
-        file.write_all(content).await?;
-        file.sync_all().await?;
+        .await
+        .map_err(CrashsafeOverwriteError::CreateTempfile)?;
+        file.write_all(content)
+            .await
+            .map_err(CrashsafeOverwriteError::WriteContents)?;
+        file.sync_all()
+            .await
+            .map_err(CrashsafeOverwriteError::SyncTempfile)?;
        drop(file); // before the rename, that's important!
                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)?;
+        std::fs::rename(tmp_path, final_path)
+            .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
        // Only open final path parent dirfd now, so that this operation only
        // ever holds one VirtualFile fd at a time.  That's important because
        // the current `find_victim_slot` impl might pick the same slot for both
        // VirtualFile., and it eventually does a blocking write lock instead of
        // try_lock.
        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
-        final_parent_dirfd.sync_all().await?;
+            Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
+                .await
+                .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
+        final_parent_dirfd
+            .sync_all()
+            .await
+            .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
        Ok(())
    }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Bojan Serafimov	b1de46c18d	wip	2023-11-01 20:50:20 -04:00
Bojan Serafimov	88064d8c1d	wip	2023-11-01 17:13:56 -04:00